You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

451 lines
14 KiB

  1. # cython: language_level=3
  2. import types
  3. import sys
  4. import string
  5. import re
  6. import tempfile
  7. import os
  8. import io
  9. from contextlib import contextmanager
  10. from codecs import register_error
  11. from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
  12. from cpython cimport PyBytes_Check, PyUnicode_Check
  13. from cpython cimport array as c_array
  14. from libc.errno cimport errno
  15. from libc.stdlib cimport calloc, free
  16. from libc.string cimport strerror, strncpy
  17. from libc.stdint cimport INT32_MAX, int32_t
  18. from libc.stdio cimport fprintf, stderr, fflush
  19. from libc.stdio cimport stdout as c_stdout
  20. from posix.fcntl cimport open as c_open, O_WRONLY, O_CREAT, O_TRUNC
  21. from posix.unistd cimport dup as c_dup, SEEK_SET, SEEK_CUR, SEEK_END, STDOUT_FILENO
  22. from pysam.libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \
  23. samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn
  24. from pysam.libcbcftools cimport bcftools_dispatch, bcftools_set_stdout, bcftools_set_stderr, \
  25. bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn
#####################################################################
# hard-coded constants

# Largest value of a signed 32-bit integer (2**31 - 1).  parse_region()
# uses it as the default stop coordinate and as the upper bound when
# validating start/stop.
cdef int MAX_POS = (1 << 31) - 1
  29. #################################################################
  30. # Utility functions for quality string conversions
  31. cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
  32. """convert a qualitystring to an array of quality values."""
  33. if input_str is None:
  34. return None
  35. qs = force_bytes(input_str)
  36. cdef char i
  37. return c_array.array('B', [i - offset for i in qs])
  38. cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
  39. """convert an array of quality values to a string."""
  40. if qualities is None:
  41. return None
  42. cdef int x
  43. cdef c_array.array result
  44. result = c_array.clone(qualities, len(qualities), zero=False)
  45. for x from 0 <= x < len(qualities):
  46. result[x] = qualities[x] + offset
  47. return force_str(result.tobytes())
  48. cpdef qualities_to_qualitystring(qualities, int offset=33):
  49. """convert a list or array of quality scores to the string
  50. representation used in the SAM format.
  51. Parameters
  52. ----------
  53. offset : int
  54. offset to be added to the quality scores to arrive at
  55. the characters of the quality string (default=33).
  56. Returns
  57. -------
  58. string
  59. a quality string
  60. """
  61. cdef char x
  62. if qualities is None:
  63. return None
  64. elif isinstance(qualities, c_array.array):
  65. return array_to_qualitystring(qualities, offset=offset)
  66. else:
  67. # tuples and lists
  68. return force_str("".join([chr(x + offset) for x in qualities]))
  69. ########################################################################
  70. ## String encoding configuration facilities
  71. ########################################################################
  72. # Codec error handler that just interprets each bad byte as ISO-8859-1.
  73. def latin1_replace(exception):
  74. return (chr(exception.object[exception.start]), exception.end)
# Make the handler available to the codecs machinery under the name
# 'pysam.latin1replace'.
register_error('pysam.latin1replace', latin1_replace)


# Name of the codec error handler that the str/bytes conversion helpers in
# this module fall back to when no explicit `errors` argument is given.
cdef str ERROR_HANDLER = 'strict'


cpdef get_encoding_error_handler():
    """Return the name of the currently configured codec error handler."""
    return ERROR_HANDLER
  79. cpdef set_encoding_error_handler(name):
  80. global ERROR_HANDLER
  81. previous = ERROR_HANDLER
  82. ERROR_HANDLER = name
  83. return previous
  84. ########################################################################
  85. ## Python 3 compatibility functions
  86. ########################################################################
cdef from_string_and_size(const char* s, size_t length):
    """Decode the first `length` bytes at `s` as UTF-8 and return the result.

    Decoding errors are handled according to the module-level ERROR_HANDLER.
    """
    # NOTE(review): `s` is not checked for NULL here and the encoding is the
    # literal 'utf-8' rather than TEXT_ENCODING -- confirm this is intended.
    return s[:length].decode('utf-8', ERROR_HANDLER)
# filename encoding (adapted from lxml.etree.pyx)
# First non-empty of: filesystem encoding, interpreter default encoding,
# 'ascii'.
cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'

# Default encoding used by the str/bytes conversion helpers in this module
# when the caller does not pass an explicit `encoding`.
cdef str TEXT_ENCODING = 'utf-8'
  92. cdef bytes encode_filename(object filename):
  93. """Make sure a filename is 8-bit encoded (or None)."""
  94. if filename is None:
  95. return None
  96. return os.fsencode(filename)
  97. cdef bytes force_bytes(object s, encoding=None, errors=None):
  98. """convert string or unicode object to bytes, assuming
  99. utf8 encoding.
  100. """
  101. if s is None:
  102. return None
  103. elif PyBytes_Check(s):
  104. return s
  105. elif PyUnicode_Check(s):
  106. return s.encode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
  107. else:
  108. raise TypeError("Argument must be string, bytes or unicode.")
cdef charptr_to_str(const char* s, encoding=None, errors=None):
    """Decode the C string `s` and return the resulting str.

    Returns None when `s` is NULL.  `encoding` defaults to TEXT_ENCODING and
    `errors` to the module-level ERROR_HANDLER.
    """
    if s == NULL:
        return None
    return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None):
    """Decode the first `n` bytes at `s` and return the resulting str.

    Returns None when `s` is NULL.  `encoding` defaults to TEXT_ENCODING and
    `errors` to the module-level ERROR_HANDLER.
    """
    if s == NULL:
        return None
    return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
  117. cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None):
  118. if s == NULL:
  119. return None
  120. else:
  121. return s
  122. cdef force_str(object s, encoding=None, errors=None):
  123. """Return s converted to str type of current Python
  124. (bytes in Py2, unicode in Py3)"""
  125. if s is None:
  126. return None
  127. if PyBytes_Check(s):
  128. return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
  129. # assume unicode
  130. return s
  131. cdef decode_bytes(bytes s, encoding=None, errors=None):
  132. """Return s converted to current Python's str type,
  133. always decoding even in Python 2"""
  134. if s is None:
  135. return None
  136. else:
  137. return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
  138. cdef OSError_from_errno(message, filename=None):
  139. cdef int err = errno
  140. if filename is not None: filename = os.fsdecode(filename)
  141. return OSError(err, f"{message}: {strerror(err).decode()}", filename)
  142. cpdef parse_region(contig=None,
  143. start=None,
  144. stop=None,
  145. region=None,
  146. reference=None,
  147. end=None):
  148. """parse alternative ways to specify a genomic region. A region can
  149. either be specified by :term:`reference`, `start` and
  150. `end`. `start` and `end` denote 0-based, half-open intervals.
  151. :term:`reference` and `end` are also accepted for backward
  152. compatibility as synonyms for :term:`contig` and `stop`,
  153. respectively.
  154. Alternatively, a samtools :term:`region` string can be supplied.
  155. If any of the coordinates are missing they will be replaced by the
  156. minimum (`start`) or maximum (`end`) coordinate.
  157. Note that region strings are 1-based, while `start` and `end`
  158. denote an interval in python coordinates.
  159. Returns
  160. -------
  161. tuple : a tuple of `reference`, `start` and `end`.
  162. Raises
  163. ------
  164. ValueError
  165. for invalid or out of bounds regions.
  166. """
  167. cdef int32_t rstart
  168. cdef int32_t rstop
  169. if reference is not None:
  170. if contig is not None:
  171. raise ValueError('contig and reference should not both be specified')
  172. contig = reference
  173. if contig is not None and region is not None:
  174. raise ValueError('contig/reference and region should not both be specified')
  175. if end is not None:
  176. if stop is not None:
  177. raise ValueError('stop and end should not both be specified')
  178. stop = end
  179. if contig is None and region is None:
  180. raise ValueError("neither contig nor region are given")
  181. rstart = 0
  182. rstop = MAX_POS
  183. if start is not None:
  184. try:
  185. rstart = start
  186. except OverflowError:
  187. raise ValueError('start out of range (%i)' % start)
  188. if stop is not None:
  189. try:
  190. rstop = stop
  191. except OverflowError:
  192. raise ValueError('stop out of range (%i)' % stop)
  193. if region:
  194. if ":" in region:
  195. contig, coord = region.split(":")
  196. parts = coord.split("-")
  197. rstart = int(parts[0]) - 1
  198. if len(parts) >= 1:
  199. rstop = int(parts[1])
  200. else:
  201. contig = region
  202. if rstart > rstop:
  203. raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
  204. if not 0 <= rstart < MAX_POS:
  205. raise ValueError('start out of range (%i)' % rstart)
  206. if not 0 <= rstop <= MAX_POS:
  207. raise ValueError('stop out of range (%i)' % rstop)
  208. return contig, rstart, rstop
  209. cdef int libc_whence_from_io(int whence):
  210. # io.SEEK_SET/_CUR/_END are by definition 0/1/2 but C/POSIX's equivalents
  211. # have unspecified values. So we must translate, but checking for 0/1/2
  212. # rather than io.SEEK_SET/etc suffices.
  213. if whence == 0: return SEEK_SET
  214. if whence == 1: return SEEK_CUR
  215. if whence == 2: return SEEK_END
  216. return whence # Otherwise likely invalid, but let HTSlib or OS report it
def _pysam_dispatch(collection,
                    method,
                    args=None,
                    catch_stdout=True,
                    is_usage=False,
                    save_stdout=None):
    '''call ``method`` in samtools/bcftools providing arguments in args.

    By default, stdout is redirected to a temporary file using the patched
    C sources except for a few commands that have an explicit output option
    (typically: -o). In these commands (such as samtools view), this explicit
    option is used. If *is_usage* is True, then these explicit output options
    will not be used.

    Catching of stdout can be turned off by setting *catch_stdout* to
    False.

    Parameters
    ----------

    collection : str
        ``"samtools"`` or ``"bcftools"``; for any other value nothing is
        run and the return code is 127.

    method : str
        name of the sub-command to run.

    args : sequence of str, optional
        command line arguments for the sub-command; copied into a new list.

    catch_stdout : bool or None
        if true, stdout is collected via a temporary file. If None, a
        duplicate of the process's stdout descriptor is used; for any other
        false value /dev/null is opened instead.

    is_usage : bool
        if True, the explicit output option (``-o``) is not appended.

    save_stdout : str, optional
        if given, stdout is written to this file and not returned.

    Returns
    -------

    tuple
        ``(retval, out_stderr, out_stdout)``. `out_stdout` is None when
        *save_stdout* is given or *catch_stdout* is false. Collected output
        is text, or bytes if it could not be decoded.

    Raises
    ------

    IOError
        if *method* is ``"index"`` and the first non-option argument does
        not exist.

    OSError
        if the *save_stdout* file could not be opened.

    '''

    if method == "index" and args:
        # We make sure that at least the first specified input file exists,
        # and if it doesn't we raise an IOError.
        ARGUMENTS = ['-m', '--min-shift', '-o', '--output', '--output-file', '-@', '--threads']
        skip_next = False
        for arg in args:
            if skip_next:
                skip_next = False
                continue
            if arg.startswith('-'):
                # Skip next argument for e.g. '--min-shift' '12' or '-m' '12' but not '-m12'
                if arg in ARGUMENTS:
                    skip_next = True
                continue
            if not os.path.exists(arg):
                raise IOError("No such file or directory: '%s'" % arg)
            else:
                # only the first positional argument is checked
                break

    # work on a private list so the caller's sequence is never modified
    if args is None:
        args = []
    else:
        args = list(args)

    # redirect stderr to file
    stderr_h, stderr_f = tempfile.mkstemp()

    # redirect stdout to file
    if save_stdout:
        stdout_f = save_stdout
        stdout_h = c_open(force_bytes(stdout_f),
                          O_WRONLY|O_CREAT|O_TRUNC, 0666)
        if stdout_h == -1:
            # NOTE(review): the stderr temporary file created above is not
            # cleaned up on this error path -- confirm whether that matters.
            raise OSError_from_errno("Could not redirect standard output", stdout_f)

        samtools_set_stdout_fn(force_bytes(stdout_f))
        bcftools_set_stdout_fn(force_bytes(stdout_f))

    elif catch_stdout:
        stdout_h, stdout_f = tempfile.mkstemp()
        # sub-commands that take an explicit output option, and its format
        MAP_STDOUT_OPTIONS = {
            "samtools": {
                "view": "-o {}",
                "mpileup": "-o {}",
                "depad": "-o {}",
                "calmd": "",  # uses pysam_stdout_fn
            },
            "bcftools": {}
        }

        stdout_option = None
        if collection == "bcftools":
            # in bcftools, most methods accept -o, the exceptions
            # are below:
            if method not in ("head", "index", "roh", "stats"):
                stdout_option = "-o {}"
        elif method in MAP_STDOUT_OPTIONS[collection]:
            # special case - samtools view -c outputs on stdout
            if not(method == "view" and "-c" in args):
                stdout_option = MAP_STDOUT_OPTIONS[collection][method]

        if stdout_option is not None and not is_usage:
            # The command writes to the temporary file by name, so the
            # descriptor from mkstemp() is closed and the stdout handle
            # handed to the tool points at /dev/null instead.
            os.close(stdout_h)
            samtools_set_stdout_fn(force_bytes(stdout_f))
            bcftools_set_stdout_fn(force_bytes(stdout_f))
            args.extend(stdout_option.format(stdout_f).split(" "))
            stdout_h = c_open(b"/dev/null", O_WRONLY)
    else:
        # stdout is not collected: "-" is passed as the output file name
        samtools_set_stdout_fn("-")
        bcftools_set_stdout_fn("-")
        if catch_stdout is None: stdout_h = c_dup(STDOUT_FILENO)
        else: stdout_h = c_open(b"/dev/null", O_WRONLY)

    # setup the function call to samtools/bcftools main
    cdef char ** cargs
    cdef int i, n, retval, l
    n = len(args)
    method = force_bytes(method)
    collection = force_bytes(collection)
    args = [force_bytes(a) for a in args]

    # allocate two more for first (dummy) argument (contains command)
    cdef int extra_args = 0
    if method == b"index":
        extra_args = 1
    # add extra arguments for commands accepting optional arguments
    # such as 'samtools index x.bam [out.index]'
    # NOTE(review): the result of calloc() is not checked for NULL.
    cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
    # cargs[0] and cargs[1] point into the `collection` and `method` bytes
    # objects; only the entries from index 2 on are separately allocated.
    cargs[0] = collection
    cargs[1] = method

    # create copies of strings - getopt for long options permutes
    # arguments
    for i from 0 <= i < n:
        l = len(args[i])
        # calloc zero-fills, so the copy stays NUL-terminated
        cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
        strncpy(cargs[i + 2], args[i], l)

    # call samtools/bcftools
    if collection == b"samtools":
        samtools_set_stdout(stdout_h)
        samtools_set_stderr(stderr_h)
        retval = samtools_dispatch(n + 2, cargs)
        samtools_close_stdout()
        samtools_close_stderr()
    elif collection == b"bcftools":
        bcftools_set_stdout(stdout_h)
        bcftools_set_stderr(stderr_h)
        retval = bcftools_dispatch(n + 2, cargs)
        bcftools_close_stdout()
        bcftools_close_stderr()
    else:
        # unknown -- just return a Unix shell's "command not found" exit status
        # NOTE(review): stdout_h/stderr_h are not closed on this path -- TODO confirm
        retval = 127

    # release the per-argument copies, then the argument vector itself
    for i from 0 <= i < n:
        free(cargs[i + 2])
    free(cargs)

    # get error messages
    def _collect(fn):
        # Return the content of file `fn` (text, or bytes if it cannot be
        # decoded) and remove the file afterwards.
        out = []
        try:
            with open(fn, "r") as inf:
                out = inf.read()
        except UnicodeDecodeError:
            with open(fn, "rb") as inf:
                # read binary output
                out = inf.read()
        finally:
            os.remove(fn)
        return out

    out_stderr = _collect(stderr_f)
    if save_stdout:
        # output went to the user-supplied file; nothing to return
        out_stdout = None
    elif catch_stdout:
        out_stdout = _collect(stdout_f)
    else:
        out_stdout = None

    return retval, out_stderr, out_stdout
# Names exported by a star-import of this module.
__all__ = [
    "qualitystring_to_array",
    "array_to_qualitystring",
    "qualities_to_qualitystring",
    "get_encoding_error_handler",
    "set_encoding_error_handler",
]