# cython: embedsignature=True
#
# Code to read, write and edit VCF files
#
# VCF lines are encoded as a dictionary with these keys (note: all lowercase):
# 'chrom': string
# 'pos': integer
# 'id': string
# 'ref': string
# 'alt': list of strings
# 'qual': integer
# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
# 'info': dictionary of values (see below)
# 'format': list of keys (strings)
# sample keys: dictionary of values (see below)
#
# The sample keys are accessible through vcf.getsamples()
#
# A dictionary of values contains value keys (defined in ##INFO or
# ##FORMAT lines) which map to a list, containing integers, floats,
# strings, or characters. Missing values are replaced by a particular
# value, often -1 or .
#
# Genotypes are not stored as a string, but as a list of 1 or 3
# elements (for haploid and diploid samples), the first (and last) the
# integer representing an allele, and the second the separation
# character. Note that there is just one genotype per sample, but for
# consistency the single element is stored in a list.
#
# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
# (key, value) pairs and are accessible through getheader()
#
# The VCF class can be instantiated with a 'regions' variable
# consisting of tuples (chrom,start,end) encoding 0-based half-open
# segments. Only variants with a position inside the segment will be
# parsed. A regions parser is available under parse_regions.
#
# When instantiated, a reference can be passed to the VCF class. This
# may be any class that supports a fetch(chrom, start, end) method.
#
# NOTE: the position that is returned to Python is 0-based, NOT
# 1-based as in the VCF file.
# NOTE: There is also preliminary VCF functionality in the VariantFile class.
#
# TODO:
#    only v4.0 writing is complete; alleles are not converted to v3.3 format
#
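# Illustrative sketch (added comment, not from the original source): assuming
# the header defines DP as an Integer INFO field, a data line such as
#   chr1  1001  rs123  A  G,T  50  PASS  DP=14  GT:GQ  0|1:40
# would be returned by parse_data() roughly as
#   {'chrom': 'chr1', 'pos': 1000,          # 0-based position
#    'id': 'rs123', 'ref': 'A', 'alt': ['G', 'T'],
#    'qual': 50.0, 'filter': [], 'info': {'DP': [14]},
#    'format': ['GT', 'GQ'],
#    '<sample>': {'GT': [[0, '|', 1]], 'GQ': [40]}}
# where '<sample>' stands for the sample name taken from the heading line.
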
from collections import namedtuple, defaultdict
from operator import itemgetter
import sys, re, copy, bisect

from libc.stdlib cimport atoi
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t

cimport pysam.libctabix as libctabix
cimport pysam.libctabixproxies as libctabixproxies
from pysam.libcutils cimport force_str

import pysam

gtsRegEx = re.compile("[|/\\\\]")
alleleRegEx = re.compile('^[ACGTN]+$')

# Utility function. Uses 0-based coordinates
def get_sequence(chrom, start, end, fa):
    # obtain sequence from .fa file, without truncation
    if end <= start: return ""
    if not fa: return "N" * (end - start)
    if start < 0: return "N" * (-start) + get_sequence(chrom, 0, end, fa).upper()
    sequence = fa.fetch(chrom, start, end).upper()
    if len(sequence) < end - start: sequence += "N" * (end - start - len(sequence))
    return sequence

# Utility function. Parses a region string
def parse_regions(string):
    result = []
    for r in string.split(','):
        elts = r.split(':')
        chrom, start, end = elts[0], 0, 3000000000
        if len(elts) == 1:
            pass
        elif len(elts) == 2:
            if len(elts[1]) > 0:
                ielts = elts[1].split('-')
                if len(ielts) != 2:
                    raise ValueError("Don't understand region string '%s'" % r)
                try: start, end = int(ielts[0]) - 1, int(ielts[1])
                except: raise ValueError("Don't understand region string '%s'" % r)
        else:
            raise ValueError("Don't understand region string '%s'" % r)
        result.append((chrom, start, end))
    return result
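
# Illustrative sketch (added comment, not from the original source):
#   parse_regions("chr1:101-200,chrX")
# would return [('chr1', 100, 200), ('chrX', 0, 3000000000)], i.e. 0-based
# half-open (chrom, start, end) tuples.
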
FORMAT = namedtuple('FORMAT', 'id numbertype number type description missingvalue')

###########################################################################################################
#
# New class
#
###########################################################################################################

cdef class VCFRecord(libctabixproxies.TupleProxy):
    '''vcf record.

    initialized from data and vcf meta
    '''

    cdef vcf
    cdef char * contig
    cdef uint32_t pos

    def __init__(self, vcf):
        self.vcf = vcf
        self.encoding = vcf.encoding

        # if len(data) != len(self.vcf._samples):
        #     self.vcf.error(str(data),
        #                    self.BAD_NUMBER_OF_COLUMNS,
        #                    "expected %s for %s samples (%s), got %s" % \
        #                        (len(self.vcf._samples),
        #                         len(self.vcf._samples),
        #                         self.vcf._samples,
        #                         len(data)))

    def __cinit__(self, vcf):
        # start indexed access at genotypes
        self.offset = 9
        self.vcf = vcf
        self.encoding = vcf.encoding

    def error(self, line, error, opt=None):
        '''raise error.'''
        # pass to vcf file for error handling
        return self.vcf.error(line, error, opt)

    cdef update(self, char * buffer, size_t nbytes):
        '''update internal data.

        nbytes does not include the terminal '\0'.
        '''
        libctabixproxies.TupleProxy.update(self, buffer, nbytes)

        self.contig = self.fields[0]
        # vcf counts from 1 - correct here
        self.pos = atoi(self.fields[1]) - 1

    def __len__(self):
        return max(0, self.nfields - 9)

    property contig:
        def __get__(self): return self.contig

    property pos:
        def __get__(self): return self.pos

    property id:
        def __get__(self): return self.fields[2]

    property ref:
        def __get__(self):
            return self.fields[3]

    property alt:
        def __get__(self):
            # convert v3.3 to v4.0 alleles below
            alt = self.fields[4]
            if alt == ".": alt = []
            else: alt = alt.upper().split(',')
            return alt

    property qual:
        def __get__(self):
            qual = self.fields[5]
            if qual == b".": qual = -1
            else:
                try: qual = float(qual)
                except: self.vcf.error(str(self), self.QUAL_NOT_NUMERICAL)
            return qual

    property filter:
        def __get__(self):
            f = self.fields[6]
            # postpone checking that filters exist. Encode missing filter or no filtering as empty list
            if f == b"." or f == b"PASS" or f == b"0": return []
            else: return f.split(';')

    property info:
        def __get__(self):
            col = self.fields[7]
            # dictionary of keys, and list of values
            info = {}
            if col != b".":
                for blurp in col.split(';'):
                    elts = blurp.split('=')
                    if len(elts) == 1: v = None
                    elif len(elts) == 2: v = elts[1]
                    else: self.vcf.error(str(self), self.ERROR_INFO_STRING)
                    info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf))
            return info

    property format:
        def __get__(self):
            return self.fields[8].split(':')

    property samples:
        def __get__(self):
            return self.vcf._samples

    def __getitem__(self, key):

        # parse sample columns
        values = self.fields[self.vcf._sample2column[key]].split(':')
        alt = self.alt
        format = self.format

        if len(values) > len(format):
            self.vcf.error(str(self.line), self.BAD_NUMBER_OF_VALUES,
                           "(found %s values in element %s; expected %s)" %
                           (len(values), key, len(format)))

        result = {}
        for idx in range(len(format)):
            expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
            if idx < len(values): value = values[idx]
            else:
                if expected == -1: value = "."
                else: value = ",".join(["."] * expected)

            result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
            if expected != -1 and len(result[format[idx]]) != expected:
                self.vcf.error(str(self.data), self.BAD_NUMBER_OF_PARAMETERS,
                               "id=%s, expected %s parameters, got %s" % (format[idx], expected, result[format[idx]]))
                if len(result[format[idx]]) < expected:
                    result[format[idx]] += [result[format[idx]][-1]] * (expected - len(result[format[idx]]))
                result[format[idx]] = result[format[idx]][:expected]

        return result


cdef class asVCFRecord(libctabix.Parser):
    '''converts a :term:`tabix row` into a VCF record.'''

    cdef vcffile

    def __init__(self, vcffile):
        self.vcffile = vcffile

    cdef parse(self, char * buffer, int len):
        cdef VCFRecord r
        r = VCFRecord(self.vcffile)
        r.copy(buffer, len)
        return r


class VCF(object):

    # types
    NT_UNKNOWN = 0
    NT_NUMBER = 1
    NT_ALLELES = 2
    NT_NR_ALLELES = 3
    NT_GENOTYPES = 4
    NT_PHASED_GENOTYPES = 5

    _errors = {
        0:  "UNKNOWN_FORMAT_STRING:Unknown file format identifier",
        1:  "BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
        2:  "BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
        3:  "BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
        4:  "POS_NOT_NUMERICAL:Position column is not numerical",
        5:  "UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
        6:  "V33_BAD_REF:Reference should be single-character in v3.3 VCF",
        7:  "V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
        8:  "POS_NOT_POSITIVE:Position field must be >0",
        9:  "QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
        10: "ERROR_INFO_STRING:Error while parsing info field",
        11: "ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
        12: "ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
        13: "ERROR_FORMAT_NOT_CHAR:Expected character in formatted field; got string",
        14: "FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
        15: "FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
        16: "BAD_NUMBER_OF_VALUES:Found too many values in sample column (%s)",
        17: "BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
        18: "BAD_GENOTYPE:Cannot parse genotype (%s)",
        19: "V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
        20: "MISSING_REF:Reference allele missing",
        21: "V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
        22: "V40_MISSING_ANGLE_BRACKETS:Format definition is not delimited by angle brackets",
        23: "FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
        24: "V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
        25: "HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
        26: "WRONG_REF:Wrong reference %s",
        27: "ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
        28: "BAD_CHR_TAG:Error calculating chr tag for %s",
        29: "ZERO_LENGTH_ALLELE:Found zero-length allele",
        30: "MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
        31: "ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
        32: "ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
        33: "ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
        }

    # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
    _header = []

    # version number; 33=v3.3; 40=v4.0
    _version = 40

    # info, filter and format data
    _info = {}
    _filter = {}
    _format = {}

    # header; and required columns
    _required = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    _samples = []

    # control behaviour
    _ignored_errors = set([11, 31])   # ERROR_UNKNOWN_KEY, ZERO_FOR_NON_FLAG_FIELD
    _warn_errors = set([])
    _leftalign = False

    # reference sequence
    _reference = None

    # regions to include; None includes everything
    _regions = None

    # stateful stuff
    _lineno = -1
    _line = None
    _lines = None

    def __init__(self, _copy=None, reference=None, regions=None,
                 lines=None, leftalign=False):
        # make error identifiers accessible by name
        for id in self._errors.keys():
            self.__dict__[self._errors[id].split(':')[0]] = id
        if _copy != None:
            self._leftalign = _copy._leftalign
            self._header = _copy._header[:]
            self._version = _copy._version
            self._info = copy.deepcopy(_copy._info)
            self._filter = copy.deepcopy(_copy._filter)
            self._format = copy.deepcopy(_copy._format)
            self._samples = _copy._samples[:]
            self._sample2column = copy.deepcopy(_copy._sample2column)
            self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
            self._warn_errors = copy.deepcopy(_copy._warn_errors)
            self._reference = _copy._reference
            self._regions = _copy._regions
        if reference: self._reference = reference
        if regions: self._regions = regions
        if leftalign: self._leftalign = leftalign
        self._lines = lines
        self.encoding = "ascii"
        self.tabixfile = None

    def error(self, line, error, opt=None):
        if error in self._ignored_errors: return
        errorlabel, errorstring = self._errors[error].split(':')
        if opt: errorstring = errorstring % opt
        errwarn = ["Error", "Warning"][error in self._warn_errors]
        errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno, line, errwarn, errorlabel, errorstring)
        if error in self._warn_errors: return
        raise ValueError(errorstring)

    def parse_format(self, line, format, filter=False):
        if self._version == 40:
            if not format.startswith('<'):
                self.error(line, self.V40_MISSING_ANGLE_BRACKETS)
                format = "<" + format
            if not format.endswith('>'):
                self.error(line, self.V40_MISSING_ANGLE_BRACKETS)
                format += ">"
            format = format[1:-1]

        data = {'id': None, 'number': None, 'type': None, 'descr': None}
        idx = 0
        while len(format.strip()) > 0:
            elts = format.strip().split(',')
            first, rest = elts[0], ','.join(elts[1:])
            if first.find('=') == -1 or (first.find('"') >= 0 and first.find('=') > first.find('"')):
                if self._version == 40: self.error(line, self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
                if idx == 4: self.error(line, self.BADLY_FORMATTED_FORMAT_STRING)
                first = ["ID=", "Number=", "Type=", "Description="][idx] + first
            if first.startswith('ID='): data['id'] = first.split('=')[1]
            elif first.startswith('Number='): data['number'] = first.split('=')[1]
            elif first.startswith('Type='): data['type'] = first.split('=')[1]
            elif first.startswith('Description='):
                elts = format.split('"')
                if len(elts) < 3:
                    self.error(line, self.FORMAT_MISSING_QUOTES)
                    elts = first.split('=') + [rest]
                data['descr'] = elts[1]
                rest = '"'.join(elts[2:])
                if rest.startswith(','): rest = rest[1:]
            else:
                self.error(line, self.BADLY_FORMATTED_FORMAT_STRING)
            format = rest
            idx += 1
            if filter and idx == 1: idx = 3    # skip number and type fields for FILTER format strings

        if not data['id']: self.error(line, self.BADLY_FORMATTED_FORMAT_STRING)
        if 'descr' not in data:
            # missing description
            self.error(line, self.BADLY_FORMATTED_FORMAT_STRING)
            data['descr'] = ""
        if not data['type'] and not data['number']:
            # fine, ##filter format
            return FORMAT(data['id'], self.NT_NUMBER, 0, "Flag", data['descr'], '.')
        if not data['type'] in ["Integer", "Float", "Character", "String", "Flag"]:
            self.error(line, self.BADLY_FORMATTED_FORMAT_STRING)
        # I would like a missing-value field, but it isn't there
        if data['type'] in ['Integer', 'Float']: data['missing'] = None    # Do NOT use arbitrary int/float as missing value
        else: data['missing'] = '.'
        if not data['number']: self.error(line, self.BADLY_FORMATTED_FORMAT_STRING)
        try:
            n = int(data['number'])
            t = self.NT_NUMBER
        except ValueError:
            n = -1
            if data['number'] == '.': t = self.NT_UNKNOWN
            elif data['number'] == '#alleles': t = self.NT_ALLELES
            elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
            elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
            elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
            # abbreviations added in VCF version v4.1
            elif data['number'] == 'A': t = self.NT_ALLELES
            elif data['number'] == 'G': t = self.NT_GENOTYPES
            else:
                self.error(line, self.BADLY_FORMATTED_FORMAT_STRING)
        # if number is 0 - type must be Flag
        if n == 0 and data['type'] != 'Flag':
            self.error(line, self.ZERO_FOR_NON_FLAG_FIELD)
            # force type 'Flag' if no number
            data['type'] = 'Flag'

        return FORMAT(data['id'], t, n, data['type'], data['descr'], data['missing'])
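
    # Illustrative sketch (added comment, not from the original source): for a
    # v4.0 header line such as
    #   ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
    # parse_format() receives the part after "INFO=" and would return
    #   FORMAT(id='DP', numbertype=NT_NUMBER, number=1, type='Integer',
    #          description='Total Depth', missingvalue=None)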

    def format_format(self, fmt, filter=False):
        values = [('ID', fmt.id)]
        if fmt.number != None and not filter:
            if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
            elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
            elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
            elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
            elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
            elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
            else:
                raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
            values.append(('Number', nmb))
        values.append(('Type', fmt.type))
        values.append(('Description', '"' + fmt.description + '"'))
        if self._version == 33:
            format = ",".join([v for k, v in values])
        else:
            format = "<" + (",".join(["%s=%s" % (k, v) for (k, v) in values])) + ">"
        return format

    def get_expected(self, format, formatdict, alt):
        fmt = formatdict[format]
        if fmt.numbertype == self.NT_UNKNOWN: return -1
        if fmt.numbertype == self.NT_NUMBER: return fmt.number
        if fmt.numbertype == self.NT_ALLELES: return len(alt) + 1
        if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
        if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt) + 1) * (len(alt) + 2)) // 2
        if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt) + 1) * (len(alt) + 1)
        return 0
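
    # Illustrative sketch (added comment, not from the original source): with
    # two ALT alleles (len(alt) == 2), get_expected() returns 3 for an
    # NT_ALLELES field, 2 for NT_NR_ALLELES, (3*4)//2 == 6 for NT_GENOTYPES
    # (the number of unphased diploid genotypes) and 3*3 == 9 for
    # NT_PHASED_GENOTYPES.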

    def _add_definition(self, formatdict, key, data, line):
        if key in formatdict: return
        self.error(line, self.ERROR_UNKNOWN_KEY, key)
        if data == None:
            formatdict[key] = FORMAT(key, self.NT_NUMBER, 0, "Flag", "(Undefined tag)", ".")
            return
        if data == []: data = [""]   # unsure what type -- say string
        if type(data[0]) == type(0.0):
            formatdict[key] = FORMAT(key, self.NT_UNKNOWN, -1, "Float", "(Undefined tag)", None)
            return
        if type(data[0]) == type(0):
            formatdict[key] = FORMAT(key, self.NT_UNKNOWN, -1, "Integer", "(Undefined tag)", None)
            return
        formatdict[key] = FORMAT(key, self.NT_UNKNOWN, -1, "String", "(Undefined tag)", ".")

    # todo: trim trailing missing values
    def format_formatdata(self, data, format, key=True, value=True, separator=":"):
        output, sdata = [], []
        if type(data) == type([]):   # for FORMAT field, make data with dummy values
            d = {}
            for k in data: d[k] = []
            data = d

        # convert missing values; and silently add definitions if required
        for k in data:
            self._add_definition(format, k, data[k], "(output)")
            for idx, v in enumerate(data[k]):
                if v == format[k].missingvalue: data[k][idx] = "."

        # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
        for k in data:
            if k != 'GT': sdata.append((k, data[k]))
        sdata.sort()
        if 'GT' in data:
            sdata = [('GT', map(self.convertGTback, data['GT']))] + sdata

        for k, v in sdata:
            if v == []: v = None
            if key and value:
                if v != None: output.append(k + "=" + ','.join(map(str, v)))
                else: output.append(k)
            elif key: output.append(k)
            elif value:
                if v != None: output.append(','.join(map(str, v)))
                else: output.append(".")   # should not happen
        # snip off trailing missing data
        while len(output) > 1:
            last = output[-1].replace(',', '').replace('.', '')
            if len(last) > 0: break
            output = output[:-1]
        return separator.join(output)
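
    # Illustrative sketch (added comment, not from the original source): for the
    # INFO column, format_formatdata({'DP': [14]}, self._info, separator=";")
    # would return "DP=14" (assuming DP is defined, or silently added, as an
    # Integer field); for a sample column the method is called with key=False
    # and returns only the joined values.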

    def enter_default_format(self):
        for f in [FORMAT('GT', self.NT_NUMBER, 1, 'String', 'Genotype', '.'),
                  FORMAT('DP', self.NT_NUMBER, 1, 'Integer', 'Read depth at this position for this sample', -1),
                  FORMAT('FT', self.NT_NUMBER, 1, 'String', 'Sample Genotype Filter', '.'),
                  FORMAT('GL', self.NT_UNKNOWN, -1, 'Float', 'Genotype likelihoods', '.'),
                  FORMAT('GLE', self.NT_UNKNOWN, -1, 'Float', 'Genotype likelihoods', '.'),
                  FORMAT('GQ', self.NT_NUMBER, 1, 'Integer', 'Genotype Quality', -1),
                  FORMAT('PL', self.NT_GENOTYPES, -1, 'Integer', 'Phred-scaled genotype likelihoods', '.'),
                  FORMAT('GP', self.NT_GENOTYPES, -1, 'Float', 'Genotype posterior probabilities', '.'),
                  FORMAT('GQ', self.NT_GENOTYPES, -1, 'Integer', 'Conditional genotype quality', '.'),
                  FORMAT('HQ', self.NT_UNKNOWN, -1, 'Integer', 'Haplotype Quality', -1),   # unknown number, since may be haploid
                  FORMAT('PS', self.NT_UNKNOWN, -1, 'Integer', 'Phase set', '.'),
                  FORMAT('PQ', self.NT_NUMBER, 1, 'Integer', 'Phasing quality', -1),
                  FORMAT('EC', self.NT_ALLELES, 1, 'Integer', 'Expected alternate allele counts', -1),
                  FORMAT('MQ', self.NT_NUMBER, 1, 'Integer', 'RMS mapping quality', -1),
                  ]:
            if f.id not in self._format:
                self._format[f.id] = f

    def parse_header(self, line):
        assert line.startswith('##')
        elts = line[2:].split('=')
        key = elts[0].strip()
        value = '='.join(elts[1:]).strip()
        if key == "fileformat":
            if value == "VCFv3.3":
                self._version = 33
            elif value == "VCFv4.0":
                self._version = 40
            elif value == "VCFv4.1":
                # AH - for testing
                self._version = 40
            elif value == "VCFv4.2":
                # AH - for testing
                self._version = 40
            else:
                self.error(line, self.UNKNOWN_FORMAT_STRING)
        elif key == "INFO":
            f = self.parse_format(line, value)
            self._info[f.id] = f
        elif key == "FILTER":
            f = self.parse_format(line, value, filter=True)
            self._filter[f.id] = f
        elif key == "FORMAT":
            f = self.parse_format(line, value)
            self._format[f.id] = f
        else:
            # keep other keys in the header field
            self._header.append((key, value))

    def write_header(self, stream):
        stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
        for key, value in self._header: stream.write("##%s=%s\n" % (key, value))
        for var, label in [(self._info, "INFO"), (self._filter, "FILTER"), (self._format, "FORMAT")]:
            for f in var.values(): stream.write("##%s=%s\n" % (label, self.format_format(f, filter=(label == "FILTER"))))

    def parse_heading(self, line):
        assert line.startswith('#')
        assert not line.startswith('##')
        headings = line[1:].split('\t')
        # test for 8, as FORMAT field might be missing
        if len(headings) == 1 and len(line[1:].split()) >= 8:
            self.error(line, self.HEADING_NOT_SEPARATED_BY_TABS)
            headings = line[1:].split()

        for i, s in enumerate(self._required):
            if len(headings) <= i or headings[i] != s:
                if len(headings) <= i:
                    err = "(%sth entry not found)" % (i + 1)
                else:
                    err = "(found %s, expected %s)" % (headings[i], s)
                # self.error(line,self.BADLY_FORMATTED_HEADING,err)
                # allow FORMAT column to be absent
                if len(headings) == 8:
                    headings.append("FORMAT")
                else:
                    self.error(line, self.BADLY_FORMATTED_HEADING, err)

        self._samples = headings[9:]
        self._sample2column = dict([(y, x + 9) for x, y in enumerate(self._samples)])

    def write_heading(self, stream):
        stream.write("#" + "\t".join(self._required + self._samples) + "\n")

    def convertGT(self, GTstring):
        if GTstring == ".": return ["."]
        try:
            gts = gtsRegEx.split(GTstring)
            if len(gts) == 1: return [int(gts[0])]
            if len(gts) != 2: raise ValueError()
            if gts[0] == "." and gts[1] == ".": return [gts[0], GTstring[len(gts[0]):-len(gts[1])], gts[1]]
            return [int(gts[0]), GTstring[len(gts[0]):-len(gts[1])], int(gts[1])]
        except ValueError:
            self.error(self._line, self.BAD_GENOTYPE, GTstring)
            return [".", "|", "."]

    def convertGTback(self, GTdata):
        return ''.join(map(str, GTdata))
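
    # Illustrative sketch (added comment, not from the original source):
    # convertGT("0/1") returns [0, '/', 1], convertGT("2") returns [2] and
    # convertGT(".") returns ["."]; convertGTback() turns such a list back
    # into its string form, e.g. convertGTback([0, '/', 1]) == "0/1".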

    def parse_formatdata(self, key, value, formatdict, line):
        # To do: check that the right number of values is present
        f = formatdict.get(key, None)
        if f == None:
            self._add_definition(formatdict, key, value, line)
            f = formatdict[key]

        if f.type == "Flag":
            if value is not None: self.error(line, self.ERROR_FLAG_HAS_VALUE)
            return []

        values = value.split(',')

        # deal with trailing data in some early VCF files
        if f.type in ["Float", "Integer"] and len(values) > 0 and values[-1].find(';') > -1:
            self.error(line, self.ERROR_TRAILING_DATA, values[-1])
            values[-1] = values[-1].split(';')[0]

        if f.type == "Integer":
            for idx, v in enumerate(values):
                try:
                    if v == ".": values[idx] = f.missingvalue
                    else: values[idx] = int(v)
                except:
                    self.error(line, self.ERROR_FORMAT_NOT_INTEGER, "%s=%s" % (key, str(values)))
                    return [0] * len(values)
            return values
        elif f.type == "String":
            self._line = line
            if f.id == "GT": values = list(map(self.convertGT, values))
            return values
        elif f.type == "Character":
            for v in values:
                if len(v) != 1: self.error(line, self.ERROR_FORMAT_NOT_CHAR)
            return values
        elif f.type == "Float":
            for idx, v in enumerate(values):
                if v == ".": values[idx] = f.missingvalue
            try: return list(map(float, values))
            except:
                self.error(line, self.ERROR_FORMAT_NOT_NUMERICAL, "%s=%s" % (key, str(values)))
                return [0.0] * len(values)
        else:
            # can't happen
            self.error(line, self.ERROR_INFO_STRING)

    def inregion(self, chrom, pos):
        if not self._regions: return True
        for r in self._regions:
            if r[0] == chrom and r[1] <= pos < r[2]: return True
        return False

    def parse_data(self, line, lineparse=False):
        cols = line.split('\t')
        if len(cols) != len(self._samples) + 9:
            # gracefully deal with absent FORMAT column
            # and those missing samples
            if len(cols) == 8:
                cols.append("")
            else:
                self.error(line,
                           self.BAD_NUMBER_OF_COLUMNS,
                           "expected %s for %s samples (%s), got %s" % (len(self._samples) + 9, len(self._samples), self._samples, len(cols)))

        chrom = cols[0]

        # get 0-based position
        try: pos = int(cols[1]) - 1
        except: self.error(line, self.POS_NOT_NUMERICAL)
        if pos < 0: self.error(line, self.POS_NOT_POSITIVE)

        # implement filtering
        if not self.inregion(chrom, pos): return None

        # end of first-pass parse for sortedVCF
        if lineparse: return chrom, pos, line

        id = cols[2]

        ref = cols[3].upper()
        if ref == ".":
            self.error(line, self.MISSING_REF)
            if self._version == 33: ref = get_sequence(chrom, pos, pos + 1, self._reference)
            else: ref = ""
        else:
            for c in ref:
                if c not in "ACGTN": self.error(line, self.UNKNOWN_CHAR_IN_REF)
            if "N" in ref: ref = get_sequence(chrom, pos, pos + len(ref), self._reference)

        # make sure reference is sane
        if self._reference:
            left = max(0, pos - 100)
            faref_leftflank = get_sequence(chrom, left, pos + len(ref), self._reference)
            faref = faref_leftflank[pos - left:]
            if faref != ref: self.error(line, self.WRONG_REF, "(reference is %s, VCF says %s)" % (faref, ref))
            ref = faref

        # convert v3.3 to v4.0 alleles below
        if cols[4] == ".": alt = []
        else: alt = cols[4].upper().split(',')

        if cols[5] == ".": qual = -1
        else:
            try: qual = float(cols[5])
            except: self.error(line, self.QUAL_NOT_NUMERICAL)

        # postpone checking that filters exist. Encode missing filter or no filtering as empty list
        if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
        else: filter = cols[6].split(';')

        # dictionary of keys, and list of values
        info = {}
        if cols[7] != ".":
            for blurp in cols[7].split(';'):
                elts = blurp.split('=')
                if len(elts) == 1: v = None
                elif len(elts) == 2: v = elts[1]
                else: self.error(line, self.ERROR_INFO_STRING)
                info[elts[0]] = self.parse_formatdata(elts[0],
                                                      v,
                                                      self._info,
                                                      line)

        # Gracefully deal with absent FORMAT column
        if cols[8] == "": format = []
        else: format = cols[8].split(':')

        # check: all filters are defined
        for f in filter:
            if f not in self._filter: self.error(line, self.FILTER_NOT_DEFINED, f)

        # check: format fields are defined
        if self._format:
            for f in format:
                if f not in self._format: self.error(line, self.FORMAT_NOT_DEFINED, f)

        # convert v3.3 alleles
        if self._version == 33:
            if len(ref) != 1: self.error(line, self.V33_BAD_REF)
            newalts = []
            have_deletions = False
            for a in alt:
                if len(a) == 1: a = a + ref[1:]                       # SNP; add trailing reference
                elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:]  # insertion just beyond pos; add first and trailing reference
                elif a.startswith('D'):                               # allow D<seq> and D<num>
                    have_deletions = True
                    try:
                        l = int(a[1:])          # throws ValueError if sequence
                        if len(ref) < l:        # add to reference if necessary
                            addns = get_sequence(chrom, pos + len(ref), pos + l, self._reference)
                            ref += addns
                            for i, na in enumerate(newalts): newalts[i] = na + addns
                        a = ref[l:]             # new deletion, deleting pos...pos+l
                    except ValueError:
                        s = a[1:]
                        if len(ref) < len(s):   # add Ns to reference if necessary
                            addns = get_sequence(chrom, pos + len(ref), pos + len(s), self._reference)
                            if not s.endswith(addns) and addns != 'N' * len(addns):
                                self.error(line, self.V33_UNMATCHED_DELETION,
                                           "(deletion is %s, reference is %s)" % (a, get_sequence(chrom, pos, pos + len(s), self._reference)))
                            ref += addns
                            for i, na in enumerate(newalts): newalts[i] = na + addns
                        a = ref[len(s):]        # new deletion, deleting from pos
                else:
                    self.error(line, self.V33_BAD_ALLELE)
                newalts.append(a)
            alt = newalts
            # deletion alleles exist, add dummy 1st reference allele, and account for leading base
            if have_deletions:
                if pos == 0:
                    # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
                    addn = get_sequence(chrom, pos + len(ref), pos + len(ref) + 1, self._reference)
                    ref += addn
                    alt = [allele + addn for allele in alt]
                else:
                    addn = get_sequence(chrom, pos - 1, pos, self._reference)
                    ref = addn + ref
                    alt = [addn + allele for allele in alt]
                    pos -= 1
        else:
            # format v4.0 -- just check for nucleotides
            for allele in alt:
                if not alleleRegEx.match(allele):
                    self.error(line, self.V40_BAD_ALLELE, allele)

        # check for leading nucleotide in indel calls
        for allele in alt:
            if len(allele) != len(ref):
                if len(allele) == 0: self.error(line, self.ZERO_LENGTH_ALLELE)
                if ref[0].upper() != allele[0].upper() and "N" not in (ref[0] + allele[0]).upper():
                    self.error(line, self.MISSING_INDEL_ALLELE_REF_BASE)

        # trim trailing bases in alleles
        # AH: not certain why trimming this needs to be added
        #     disabled now for unit testing
        # if alt:
        #     for i in range(1,min(len(ref),min(map(len,alt)))):
        #         if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
        #             break
        #         ref, alt = ref[:-1], [allele[:-1] for allele in alt]

        # left-align alleles, if a reference is available
        if self._leftalign and self._reference:
            while left < pos:
                movable = True
                for allele in alt:
                    if len(allele) > len(ref):
                        longest, shortest = allele, ref
                    else:
                        longest, shortest = ref, allele
                    if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
                        movable = False
                    if longest[-1].upper() != longest[len(shortest) - 1].upper():
                        movable = False
                if not movable:
                    break
                ref = ref[:-1]
                alt = [allele[:-1] for allele in alt]
                if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
                    ref = faref_leftflank[pos - left - 1] + ref
                    alt = [faref_leftflank[pos - left - 1] + allele for allele in alt]
                    pos -= 1

        # parse sample columns
        samples = []
        for sample in cols[9:]:
            dict = {}
            values = sample.split(':')
            if len(values) > len(format):
                self.error(line, self.BAD_NUMBER_OF_VALUES, "(found %s values in element %s; expected %s)" % (len(values), sample, len(format)))
            for idx in range(len(format)):
                expected = self.get_expected(format[idx], self._format, alt)
                if idx < len(values): value = values[idx]
                else:
                    if expected == -1: value = "."
                    else: value = ",".join(["."] * expected)

                dict[format[idx]] = self.parse_formatdata(format[idx],
                                                          value,
                                                          self._format,
                                                          line)
                if expected != -1 and len(dict[format[idx]]) != expected:
                    self.error(line, self.BAD_NUMBER_OF_PARAMETERS,
                               "id=%s, expected %s parameters, got %s" % (format[idx], expected, dict[format[idx]]))
                    if len(dict[format[idx]]) < expected:
                        dict[format[idx]] += [dict[format[idx]][-1]] * (expected - len(dict[format[idx]]))
                    dict[format[idx]] = dict[format[idx]][:expected]

            samples.append(dict)

        # done
        d = {'chrom': chrom,
             'pos': pos,      # return 0-based position
             'id': id,
             'ref': ref,
             'alt': alt,
             'qual': qual,
             'filter': filter,
             'info': info,
             'format': format}
        for key, value in zip(self._samples, samples):
            d[key] = value

        return d

    def write_data(self, stream, data):
        required = ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'format'] + self._samples
        for k in required:
            if k not in data: raise ValueError("Required key %s not found in data" % str(k))
        if data['alt'] == []: alt = "."
        else: alt = ",".join(data['alt'])
        if data['filter'] == None: filter = "."
        elif data['filter'] == []:
            if self._version == 33: filter = "0"
            else: filter = "PASS"
        else: filter = ';'.join(data['filter'])
        if data['qual'] == -1: qual = "."
        else: qual = str(data['qual'])

        output = [data['chrom'],
                  str(data['pos'] + 1),   # change to 1-based position
                  data['id'],
                  data['ref'],
                  alt,
                  qual,
                  filter,
                  self.format_formatdata(
                      data['info'], self._info, separator=";"),
                  self.format_formatdata(
                      data['format'], self._format, value=False)]

        for s in self._samples:
            output.append(self.format_formatdata(
                data[s], self._format, key=False))

        stream.write("\t".join(output) + "\n")

    def _parse_header(self, stream):
        self._lineno = 0
        for line in stream:
            line = force_str(line, self.encoding)
            self._lineno += 1
            if line.startswith('##'):
                self.parse_header(line.strip())
            elif line.startswith('#'):
                self.parse_heading(line.strip())
                self.enter_default_format()
            else:
                break
        return line

    def _parse(self, line, stream):
        # deal with files with header only
        if line.startswith("##"): return
        if len(line.strip()) > 0:
            d = self.parse_data(line.strip())
            if d: yield d
        for line in stream:
            self._lineno += 1
            if self._lines and self._lineno > self._lines: return
            d = self.parse_data(line.strip())
            if d: yield d

    ######################################################################################################
    #
    # API follows
    #
    ######################################################################################################

    def getsamples(self):
        """ List of samples in VCF file """
        return self._samples

    def setsamples(self, samples):
        """ List of samples in VCF file """
        self._samples = samples

    def getheader(self):
        """ List of header key-value pairs (strings) """
        return self._header

    def setheader(self, header):
        """ List of header key-value pairs (strings) """
        self._header = header

    def getinfo(self):
        """ Dictionary of ##INFO tags, as VCF.FORMAT values """
        return self._info

    def setinfo(self, info):
        """ Dictionary of ##INFO tags, as VCF.FORMAT values """
        self._info = info

    def getformat(self):
        """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
        return self._format

    def setformat(self, format):
        """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
        self._format = format

    def getfilter(self):
        """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
        return self._filter

    def setfilter(self, filter):
        """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
        self._filter = filter

    def setversion(self, version):
        if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
        self._version = version

    def setregions(self, regions):
        self._regions = regions

    def setreference(self, ref):
        """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
        self._reference = ref

    def ignoreerror(self, errorstring):
        try: self._ignored_errors.add(self.__dict__[errorstring])
        except KeyError: raise ValueError("Invalid error string: %s" % errorstring)

    def warnerror(self, errorstring):
        try: self._warn_errors.add(self.__dict__[errorstring])
        except KeyError: raise ValueError("Invalid error string: %s" % errorstring)

    def parse(self, stream):
        """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
        last_line = self._parse_header(stream)
        # now return a generator that does the actual work. In this way the pre-processing is done
        # before the first piece of data is yielded
        return self._parse(last_line, stream)

    def write(self, stream, datagenerator):
        """ Writes a VCF file to a stream, using a data generator (or list) """
        self.write_header(stream)
        self.write_heading(stream)
        for data in datagenerator: self.write_data(stream, data)

    def writeheader(self, stream):
        """ Writes a VCF header """
        self.write_header(stream)
        self.write_heading(stream)

    def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
        """ Utility function: compares two calls for equality """
        # a variant should always be assigned to a unique position, one base before
        # the leftmost position of the alignment gap. If this rule is implemented
        # correctly, the two positions must be equal for the calls to be identical.
        if pos1 != pos2: return False
        # from both calls, trim rightmost bases when identical. Do this safely, i.e.
        # only when the reference bases are not Ns
        while len(ref1) > 0 and len(alt1) > 0 and ref1[-1] == alt1[-1]:
            ref1 = ref1[:-1]
            alt1 = alt1[:-1]
        while len(ref2) > 0 and len(alt2) > 0 and ref2[-1] == alt2[-1]:
            ref2 = ref2[:-1]
            alt2 = alt2[:-1]
        # now, the alternative alleles must be identical
        return alt1 == alt2
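
    # Illustrative sketch (added comment, not from the original source): two
    # 1 bp deletions in the same homopolymer compare as equal, e.g.
    # compare_calls(100, "AT", "A", 100, "ATT", "AT") returns True, because
    # both reduce to deleting one T at position 100 after right-trimming.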

    ###########################################################################################################
    ###########################################################################################################
    ## API functions added by Andreas
    ###########################################################################################################

    def connect(self, filename, encoding="ascii"):
        '''connect to tabix file.'''
        self.encoding = encoding
        self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
        self._parse_header(self.tabixfile.header)

    def __del__(self):
        self.close()
        self.tabixfile = None

    def close(self):
        if self.tabixfile:
            self.tabixfile.close()
            self.tabixfile = None

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None):
        """ Parse a stream of VCF-formatted lines.
        Initializes class instance and return generator """
        return self.tabixfile.fetch(
            reference,
            start,
            end,
            region,
            parser=asVCFRecord(self))

    def validate(self, record):
        '''validate vcf record.

        returns a validated record.
        '''

        raise NotImplementedError("needs to be checked")

        chrom, pos = record.chrom, record.pos

        # check reference
        ref = record.ref
        if ref == ".":
            self.error(str(record), self.MISSING_REF)
            if self._version == 33: ref = get_sequence(chrom, pos, pos + 1, self._reference)
            else: ref = ""
        else:
            for c in ref:
                if c not in "ACGTN": self.error(str(record), self.UNKNOWN_CHAR_IN_REF)
            if "N" in ref: ref = get_sequence(chrom,
                                              pos,
                                              pos + len(ref),
                                              self._reference)

        # make sure reference is sane
        if self._reference:
            left = max(0, self.pos - 100)
            faref_leftflank = get_sequence(chrom, left, self.pos + len(ref), self._reference)
            faref = faref_leftflank[pos - left:]
            if faref != ref: self.error(str(record), self.WRONG_REF, "(reference is %s, VCF says %s)" % (faref, ref))
            ref = faref

        # check: format fields are defined
        for f in record.format:
            if f not in self._format: self.error(str(record), self.FORMAT_NOT_DEFINED, f)

        # check: all filters are defined
        for f in record.filter:
            if f not in self._filter: self.error(str(record), self.FILTER_NOT_DEFINED, f)

        # convert v3.3 alleles
        if self._version == 33:
            if len(ref) != 1: self.error(str(record), self.V33_BAD_REF)
            newalts = []
            have_deletions = False
            for a in alt:
                if len(a) == 1: a = a + ref[1:]                       # SNP; add trailing reference
                elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:]  # insertion just beyond pos; add first and trailing reference
                elif a.startswith('D'):                               # allow D<seq> and D<num>
                    have_deletions = True
                    try:
                        l = int(a[1:])          # throws ValueError if sequence
                        if len(ref) < l:        # add to reference if necessary
                            addns = get_sequence(chrom, pos + len(ref), pos + l, self._reference)
                            ref += addns
                            for i, na in enumerate(newalts): newalts[i] = na + addns
                        a = ref[l:]             # new deletion, deleting pos...pos+l
                    except ValueError:
                        s = a[1:]
                        if len(ref) < len(s):   # add Ns to reference if necessary
                            addns = get_sequence(chrom, pos + len(ref), pos + len(s), self._reference)
                            if not s.endswith(addns) and addns != 'N' * len(addns):
                                self.error(str(record), self.V33_UNMATCHED_DELETION,
                                           "(deletion is %s, reference is %s)" % (a, get_sequence(chrom, pos, pos + len(s), self._reference)))
                            ref += addns
                            for i, na in enumerate(newalts): newalts[i] = na + addns
                        a = ref[len(s):]        # new deletion, deleting from pos
                else:
                    self.error(str(record), self.V33_BAD_ALLELE)
                newalts.append(a)
            alt = newalts
            # deletion alleles exist, add dummy 1st reference allele, and account for leading base
            if have_deletions:
                if pos == 0:
                    # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
                    addn = get_sequence(chrom, pos + len(ref), pos + len(ref) + 1, self._reference)
                    ref += addn
                    alt = [allele + addn for allele in alt]
                else:
                    addn = get_sequence(chrom, pos - 1, pos, self._reference)
                    ref = addn + ref
                    alt = [addn + allele for allele in alt]
                    pos -= 1
        else:
            # format v4.0 -- just check for nucleotides
            for allele in alt:
                if not alleleRegEx.match(allele):
                    self.error(str(record), self.V40_BAD_ALLELE, allele)

        # check for leading nucleotide in indel calls
        for allele in alt:
            if len(allele) != len(ref):
                if len(allele) == 0: self.error(str(record), self.ZERO_LENGTH_ALLELE)
                if ref[0].upper() != allele[0].upper() and "N" not in (ref[0] + allele[0]).upper():
                    self.error(str(record), self.MISSING_INDEL_ALLELE_REF_BASE)

        # trim trailing bases in alleles
        # AH: not certain why trimming this needs to be added
        #     disabled now for unit testing
        # for i in range(1,min(len(ref),min(map(len,alt)))):
        #     if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
        #         break
        #     ref, alt = ref[:-1], [allele[:-1] for allele in alt]

        # left-align alleles, if a reference is available
        if self._leftalign and self._reference:
            while left < pos:
                movable = True
                for allele in alt:
                    if len(allele) > len(ref):
                        longest, shortest = allele, ref
                    else:
                        longest, shortest = ref, allele
                    if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
                        movable = False
                    if longest[-1].upper() != longest[len(shortest) - 1].upper():
                        movable = False
                if not movable:
                    break
                ref = ref[:-1]
                alt = [allele[:-1] for allele in alt]
                if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
                    ref = faref_leftflank[pos - left - 1] + ref
                    alt = [faref_leftflank[pos - left - 1] + allele for allele in alt]
                    pos -= 1

__all__ = [
    "VCF", "VCFRecord", ]