You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

1912 lines
70 KiB

  1. from __future__ import with_statement, absolute_import, print_function
  2. from six import (
  3. binary_type,
  4. text_type,
  5. PY3,
  6. )
  7. from .decoders import *
  8. from .exceptions import *
  9. try:
  10. from urlparse import parse_qs
  11. except ImportError:
  12. from urllib.parse import parse_qs
  13. import os
  14. import re
  15. import sys
  16. import shutil
  17. import logging
  18. import tempfile
  19. from io import BytesIO
  20. from numbers import Number
  21. # Unique missing object.
  22. _missing = object()
  23. # States for the querystring parser.
  24. STATE_BEFORE_FIELD = 0
  25. STATE_FIELD_NAME = 1
  26. STATE_FIELD_DATA = 2
  27. # States for the multipart parser
  28. STATE_START = 0
  29. STATE_START_BOUNDARY = 1
  30. STATE_HEADER_FIELD_START = 2
  31. STATE_HEADER_FIELD = 3
  32. STATE_HEADER_VALUE_START = 4
  33. STATE_HEADER_VALUE = 5
  34. STATE_HEADER_VALUE_ALMOST_DONE = 6
  35. STATE_HEADERS_ALMOST_DONE = 7
  36. STATE_PART_DATA_START = 8
  37. STATE_PART_DATA = 9
  38. STATE_PART_DATA_END = 10
  39. STATE_END = 11
  40. STATES = [
  41. "START",
  42. "START_BOUNDARY", "HEADER_FEILD_START", "HEADER_FIELD", "HEADER_VALUE_START", "HEADER_VALUE",
  43. "HEADER_VALUE_ALMOST_DONE", "HEADRES_ALMOST_DONE", "PART_DATA_START", "PART_DATA", "PART_DATA_END", "END"
  44. ]
  45. # Flags for the multipart parser.
  46. FLAG_PART_BOUNDARY = 1
  47. FLAG_LAST_BOUNDARY = 2
  48. # Get constants. Since iterating over a str on Python 2 gives you a 1-length
  49. # string, but iterating over a bytes object on Python 3 gives you an integer,
  50. # we need to save these constants.
  51. CR = b'\r'[0]
  52. LF = b'\n'[0]
  53. COLON = b':'[0]
  54. SPACE = b' '[0]
  55. HYPHEN = b'-'[0]
  56. AMPERSAND = b'&'[0]
  57. SEMICOLON = b';'[0]
  58. LOWER_A = b'a'[0]
  59. LOWER_Z = b'z'[0]
  60. NULL = b'\x00'[0]
  61. # Lower-casing a character is different, because of the difference between
  62. # str on Py2, and bytes on Py3. Same with getting the ordinal value of a byte,
  63. # and joining a list of bytes together.
  64. # These functions abstract that.
  65. if PY3: # pragma: no cover
  66. lower_char = lambda c: c | 0x20
  67. ord_char = lambda c: c
  68. join_bytes = lambda b: bytes(list(b))
  69. else: # pragma: no cover
  70. lower_char = lambda c: c.lower()
  71. ord_char = lambda c: ord(c)
  72. join_bytes = lambda b: b''.join(list(b))
  73. # These are regexes for parsing header values.
  74. SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t')
  75. QUOTED_STR = br'"(?:\\.|[^"])*"'
  76. VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')'
  77. OPTION_RE_STR = (
  78. br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')'
  79. )
  80. OPTION_RE = re.compile(OPTION_RE_STR)
  81. QUOTE = b'"'[0]
  82. def parse_options_header(value):
  83. """
  84. Parses a Content-Type header into a value in the following format:
  85. (content_type, {parameters})
  86. """
  87. if not value:
  88. return (b'', {})
  89. # If we are passed a string, we assume that it conforms to WSGI and does
  90. # not contain any code point that's not in latin-1.
  91. if isinstance(value, text_type): # pragma: no cover
  92. value = value.encode('latin-1')
  93. # If we have no options, return the string as-is.
  94. if b';' not in value:
  95. return (value.lower().strip(), {})
  96. # Split at the first semicolon, to get our value and then options.
  97. ctype, rest = value.split(b';', 1)
  98. options = {}
  99. # Parse the options.
  100. for match in OPTION_RE.finditer(rest):
  101. key = match.group(1).lower()
  102. value = match.group(2)
  103. if value[0] == QUOTE and value[-1] == QUOTE:
  104. # Unquote the value.
  105. value = value[1:-1]
  106. value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
  107. # If the value is a filename, we need to fix a bug on IE6 that sends
  108. # the full file path instead of the filename.
  109. if key == b'filename':
  110. if value[1:3] == b':\\' or value[:2] == b'\\\\':
  111. value = value.split(b'\\')[-1]
  112. options[key] = value
  113. return ctype, options
  114. class Field(object):
  115. """A Field object represents a (parsed) form field. It represents a single
  116. field with a corresponding name and value.
  117. The name that a :class:`Field` will be instantiated with is the same name
  118. that would be found in the following HTML::
  119. <input name="name_goes_here" type="text"/>
  120. This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
  121. will be called when data is written to the Field, and when the Field is
  122. finalized, respectively.
  123. :param name: the name of the form field
  124. """
  125. def __init__(self, name):
  126. self._name = name
  127. self._value = []
  128. # We cache the joined version of _value for speed.
  129. self._cache = _missing
  130. @classmethod
  131. def from_value(klass, name, value):
  132. """Create an instance of a :class:`Field`, and set the corresponding
  133. value - either None or an actual value. This method will also
  134. finalize the Field itself.
  135. :param name: the name of the form field
  136. :param value: the value of the form field - either a bytestring or
  137. None
  138. """
  139. f = klass(name)
  140. if value is None:
  141. f.set_none()
  142. else:
  143. f.write(value)
  144. f.finalize()
  145. return f
  146. def write(self, data):
  147. """Write some data into the form field.
  148. :param data: a bytestring
  149. """
  150. return self.on_data(data)
  151. def on_data(self, data):
  152. """This method is a callback that will be called whenever data is
  153. written to the Field.
  154. :param data: a bytestring
  155. """
  156. self._value.append(data)
  157. self._cache = _missing
  158. return len(data)
  159. def on_end(self):
  160. """This method is called whenever the Field is finalized.
  161. """
  162. if self._cache is _missing:
  163. self._cache = b''.join(self._value)
  164. def finalize(self):
  165. """Finalize the form field.
  166. """
  167. self.on_end()
  168. def close(self):
  169. """Close the Field object. This will free any underlying cache.
  170. """
  171. # Free our value array.
  172. if self._cache is _missing:
  173. self._cache = b''.join(self._value)
  174. del self._value
  175. def set_none(self):
  176. """Some fields in a querystring can possibly have a value of None - for
  177. example, the string "foo&bar=&baz=asdf" will have a field with the
  178. name "foo" and value None, one with name "bar" and value "", and one
  179. with name "baz" and value "asdf". Since the write() interface doesn't
  180. support writing None, this function will set the field value to None.
  181. """
  182. self._cache = None
  183. @property
  184. def field_name(self):
  185. """This property returns the name of the field."""
  186. return self._name
  187. @property
  188. def value(self):
  189. """This property returns the value of the form field."""
  190. if self._cache is _missing:
  191. self._cache = b''.join(self._value)
  192. return self._cache
  193. def __eq__(self, other):
  194. if isinstance(other, Field):
  195. return (
  196. self.field_name == other.field_name and
  197. self.value == other.value
  198. )
  199. else:
  200. return NotImplemented
  201. def __repr__(self):
  202. if len(self.value) > 97:
  203. # We get the repr, and then insert three dots before the final
  204. # quote.
  205. v = repr(self.value[:97])[:-1] + "...'"
  206. else:
  207. v = repr(self.value)
  208. return "%s(field_name=%r, value=%s)" % (
  209. self.__class__.__name__,
  210. self.field_name,
  211. v
  212. )
  213. class File(object):
  214. """This class represents an uploaded file. It handles writing file data to
  215. either an in-memory file or a temporary file on-disk, if the optional
  216. threshold is passed.
  217. There are some options that can be passed to the File to change behavior
  218. of the class. Valid options are as follows:
  219. .. list-table::
  220. :widths: 15 5 5 30
  221. :header-rows: 1
  222. * - Name
  223. - Type
  224. - Default
  225. - Description
  226. * - UPLOAD_DIR
  227. - `str`
  228. - None
  229. - The directory to store uploaded files in. If this is None, a
  230. temporary file will be created in the system's standard location.
  231. * - UPLOAD_DELETE_TMP
  232. - `bool`
  233. - True
  234. - Delete automatically created TMP file
  235. * - UPLOAD_KEEP_FILENAME
  236. - `bool`
  237. - False
  238. - Whether or not to keep the filename of the uploaded file. If True,
  239. then the filename will be converted to a safe representation (e.g.
  240. by removing any invalid path segments), and then saved with the
  241. same name). Otherwise, a temporary name will be used.
  242. * - UPLOAD_KEEP_EXTENSIONS
  243. - `bool`
  244. - False
  245. - Whether or not to keep the uploaded file's extension. If False, the
  246. file will be saved with the default temporary extension (usually
  247. ".tmp"). Otherwise, the file's extension will be maintained. Note
  248. that this will properly combine with the UPLOAD_KEEP_FILENAME
  249. setting.
  250. * - MAX_MEMORY_FILE_SIZE
  251. - `int`
  252. - 1 MiB
  253. - The maximum number of bytes of a File to keep in memory. By
  254. default, the contents of a File are kept into memory until a certain
  255. limit is reached, after which the contents of the File are written
  256. to a temporary file. This behavior can be disabled by setting this
  257. value to an appropriately large value (or, for example, infinity,
  258. such as `float('inf')`.
  259. :param file_name: The name of the file that this :class:`File` represents
  260. :param field_name: The field name that uploaded this file. Note that this
  261. can be None, if, for example, the file was uploaded
  262. with Content-Type application/octet-stream
  263. :param config: The configuration for this File. See above for valid
  264. configuration keys and their corresponding values.
  265. """
  266. def __init__(self, file_name, field_name=None, config={}):
  267. # Save configuration, set other variables default.
  268. self.logger = logging.getLogger(__name__)
  269. self._config = config
  270. self._in_memory = True
  271. self._bytes_written = 0
  272. self._fileobj = BytesIO()
  273. # Save the provided field/file name.
  274. self._field_name = field_name
  275. self._file_name = file_name
  276. # Our actual file name is None by default, since, depending on our
  277. # config, we may not actually use the provided name.
  278. self._actual_file_name = None
  279. # Split the extension from the filename.
  280. if file_name is not None:
  281. base, ext = os.path.splitext(file_name)
  282. self._file_base = base
  283. self._ext = ext
  284. @property
  285. def field_name(self):
  286. """The form field associated with this file. May be None if there isn't
  287. one, for example when we have an application/octet-stream upload.
  288. """
  289. return self._field_name
  290. @property
  291. def file_name(self):
  292. """The file name given in the upload request.
  293. """
  294. return self._file_name
  295. @property
  296. def actual_file_name(self):
  297. """The file name that this file is saved as. Will be None if it's not
  298. currently saved on disk.
  299. """
  300. return self._actual_file_name
  301. @property
  302. def file_object(self):
  303. """The file object that we're currently writing to. Note that this
  304. will either be an instance of a :class:`io.BytesIO`, or a regular file
  305. object.
  306. """
  307. return self._fileobj
  308. @property
  309. def size(self):
  310. """The total size of this file, counted as the number of bytes that
  311. currently have been written to the file.
  312. """
  313. return self._bytes_written
  314. @property
  315. def in_memory(self):
  316. """A boolean representing whether or not this file object is currently
  317. stored in-memory or on-disk.
  318. """
  319. return self._in_memory
  320. def flush_to_disk(self):
  321. """If the file is already on-disk, do nothing. Otherwise, copy from
  322. the in-memory buffer to a disk file, and then reassign our internal
  323. file object to this new disk file.
  324. Note that if you attempt to flush a file that is already on-disk, a
  325. warning will be logged to this module's logger.
  326. """
  327. if not self._in_memory:
  328. self.logger.warning(
  329. "Trying to flush to disk when we're not in memory"
  330. )
  331. return
  332. # Go back to the start of our file.
  333. self._fileobj.seek(0)
  334. # Open a new file.
  335. new_file = self._get_disk_file()
  336. # Copy the file objects.
  337. shutil.copyfileobj(self._fileobj, new_file)
  338. # Seek to the new position in our new file.
  339. new_file.seek(self._bytes_written)
  340. # Reassign the fileobject.
  341. old_fileobj = self._fileobj
  342. self._fileobj = new_file
  343. # We're no longer in memory.
  344. self._in_memory = False
  345. # Close the old file object.
  346. old_fileobj.close()
  347. def _get_disk_file(self):
  348. """This function is responsible for getting a file object on-disk for us.
  349. """
  350. self.logger.info("Opening a file on disk")
  351. file_dir = self._config.get('UPLOAD_DIR')
  352. keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False)
  353. keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False)
  354. delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True)
  355. # If we have a directory and are to keep the filename...
  356. if file_dir is not None and keep_filename:
  357. self.logger.info("Saving with filename in: %r", file_dir)
  358. # Build our filename.
  359. # TODO: what happens if we don't have a filename?
  360. fname = self._file_base
  361. if keep_extensions:
  362. fname = fname + self._ext
  363. path = os.path.join(file_dir, fname)
  364. try:
  365. self.logger.info("Opening file: %r", path)
  366. tmp_file = open(path, 'w+b')
  367. except (IOError, OSError) as e:
  368. tmp_file = None
  369. self.logger.exception("Error opening temporary file")
  370. raise FileError("Error opening temporary file: %r" % path)
  371. else:
  372. # Build options array.
  373. # Note that on Python 3, tempfile doesn't support byte names. We
  374. # encode our paths using the default filesystem encoding.
  375. options = {}
  376. if keep_extensions:
  377. ext = self._ext
  378. if isinstance(ext, binary_type):
  379. ext = ext.decode(sys.getfilesystemencoding())
  380. options['suffix'] = ext
  381. if file_dir is not None:
  382. d = file_dir
  383. if isinstance(d, binary_type):
  384. d = d.decode(sys.getfilesystemencoding())
  385. options['dir'] = d
  386. options['delete'] = delete_tmp
  387. # Create a temporary (named) file with the appropriate settings.
  388. self.logger.info("Creating a temporary file with options: %r",
  389. options)
  390. try:
  391. tmp_file = tempfile.NamedTemporaryFile(**options)
  392. except (IOError, OSError):
  393. self.logger.exception("Error creating named temporary file")
  394. raise FileError("Error creating named temporary file")
  395. fname = tmp_file.name
  396. # Encode filename as bytes.
  397. if isinstance(fname, text_type):
  398. fname = fname.encode(sys.getfilesystemencoding())
  399. self._actual_file_name = fname
  400. return tmp_file
  401. def write(self, data):
  402. """Write some data to the File.
  403. :param data: a bytestring
  404. """
  405. return self.on_data(data)
  406. def on_data(self, data):
  407. """This method is a callback that will be called whenever data is
  408. written to the File.
  409. :param data: a bytestring
  410. """
  411. pos = self._fileobj.tell()
  412. bwritten = self._fileobj.write(data)
  413. # true file objects write returns None
  414. if bwritten is None:
  415. bwritten = self._fileobj.tell() - pos
  416. # If the bytes written isn't the same as the length, just return.
  417. if bwritten != len(data):
  418. self.logger.warning("bwritten != len(data) (%d != %d)", bwritten,
  419. len(data))
  420. return bwritten
  421. # Keep track of how many bytes we've written.
  422. self._bytes_written += bwritten
  423. # If we're in-memory and are over our limit, we create a file.
  424. if (self._in_memory and
  425. self._config.get('MAX_MEMORY_FILE_SIZE') is not None and
  426. (self._bytes_written >
  427. self._config.get('MAX_MEMORY_FILE_SIZE'))):
  428. self.logger.info("Flushing to disk")
  429. self.flush_to_disk()
  430. # Return the number of bytes written.
  431. return bwritten
  432. def on_end(self):
  433. """This method is called whenever the Field is finalized.
  434. """
  435. # Flush the underlying file object
  436. self._fileobj.flush()
  437. def finalize(self):
  438. """Finalize the form file. This will not close the underlying file,
  439. but simply signal that we are finished writing to the File.
  440. """
  441. self.on_end()
  442. def close(self):
  443. """Close the File object. This will actually close the underlying
  444. file object (whether it's a :class:`io.BytesIO` or an actual file
  445. object).
  446. """
  447. self._fileobj.close()
  448. def __repr__(self):
  449. return "%s(file_name=%r, field_name=%r)" % (
  450. self.__class__.__name__,
  451. self.file_name,
  452. self.field_name
  453. )
  454. class BaseParser(object):
  455. """This class is the base class for all parsers. It contains the logic for
  456. calling and adding callbacks.
  457. A callback can be one of two different forms. "Notification callbacks" are
  458. callbacks that are called when something happens - for example, when a new
  459. part of a multipart message is encountered by the parser. "Data callbacks"
  460. are called when we get some sort of data - for example, part of the body of
  461. a multipart chunk. Notification callbacks are called with no parameters,
  462. whereas data callbacks are called with three, as follows::
  463. data_callback(data, start, end)
  464. The "data" parameter is a bytestring (i.e. "foo" on Python 2, or b"foo" on
  465. Python 3). "start" and "end" are integer indexes into the "data" string
  466. that represent the data of interest. Thus, in a data callback, the slice
  467. `data[start:end]` represents the data that the callback is "interested in".
  468. The callback is not passed a copy of the data, since copying severely hurts
  469. performance.
  470. """
  471. def __init__(self):
  472. self.logger = logging.getLogger(__name__)
  473. def callback(self, name, data=None, start=None, end=None):
  474. """This function calls a provided callback with some data. If the
  475. callback is not set, will do nothing.
  476. :param name: The name of the callback to call (as a string).
  477. :param data: Data to pass to the callback. If None, then it is
  478. assumed that the callback is a notification callback,
  479. and no parameters are given.
  480. :param end: An integer that is passed to the data callback.
  481. :param start: An integer that is passed to the data callback.
  482. """
  483. name = "on_" + name
  484. func = self.callbacks.get(name)
  485. if func is None:
  486. return
  487. # Depending on whether we're given a buffer...
  488. if data is not None:
  489. # Don't do anything if we have start == end.
  490. if start is not None and start == end:
  491. return
  492. self.logger.debug("Calling %s with data[%d:%d]", name, start, end)
  493. func(data, start, end)
  494. else:
  495. self.logger.debug("Calling %s with no data", name)
  496. func()
  497. def set_callback(self, name, new_func):
  498. """Update the function for a callback. Removes from the callbacks dict
  499. if new_func is None.
  500. :param name: The name of the callback to call (as a string).
  501. :param new_func: The new function for the callback. If None, then the
  502. callback will be removed (with no error if it does not
  503. exist).
  504. """
  505. if new_func is None:
  506. self.callbacks.pop('on_' + name, None)
  507. else:
  508. self.callbacks['on_' + name] = new_func
  509. def close(self):
  510. pass # pragma: no cover
  511. def finalize(self):
  512. pass # pragma: no cover
  513. def __repr__(self):
  514. return "%s()" % self.__class__.__name__
  515. class OctetStreamParser(BaseParser):
  516. """This parser parses an octet-stream request body and calls callbacks when
  517. incoming data is received. Callbacks are as follows:
  518. .. list-table::
  519. :widths: 15 10 30
  520. :header-rows: 1
  521. * - Callback Name
  522. - Parameters
  523. - Description
  524. * - on_start
  525. - None
  526. - Called when the first data is parsed.
  527. * - on_data
  528. - data, start, end
  529. - Called for each data chunk that is parsed.
  530. * - on_end
  531. - None
  532. - Called when the parser is finished parsing all data.
  533. :param callbacks: A dictionary of callbacks. See the documentation for
  534. :class:`BaseParser`.
  535. :param max_size: The maximum size of body to parse. Defaults to infinity -
  536. i.e. unbounded.
  537. """
  538. def __init__(self, callbacks={}, max_size=float('inf')):
  539. super(OctetStreamParser, self).__init__()
  540. self.callbacks = callbacks
  541. self._started = False
  542. if not isinstance(max_size, Number) or max_size < 1:
  543. raise ValueError("max_size must be a positive number, not %r" %
  544. max_size)
  545. self.max_size = max_size
  546. self._current_size = 0
  547. def write(self, data):
  548. """Write some data to the parser, which will perform size verification,
  549. and then pass the data to the underlying callback.
  550. :param data: a bytestring
  551. """
  552. if not self._started:
  553. self.callback('start')
  554. self._started = True
  555. # Truncate data length.
  556. data_len = len(data)
  557. if (self._current_size + data_len) > self.max_size:
  558. # We truncate the length of data that we are to process.
  559. new_size = int(self.max_size - self._current_size)
  560. self.logger.warning("Current size is %d (max %d), so truncating "
  561. "data length from %d to %d",
  562. self._current_size, self.max_size, data_len,
  563. new_size)
  564. data_len = new_size
  565. # Increment size, then callback, in case there's an exception.
  566. self._current_size += data_len
  567. self.callback('data', data, 0, data_len)
  568. return data_len
  569. def finalize(self):
  570. """Finalize this parser, which signals to that we are finished parsing,
  571. and sends the on_end callback.
  572. """
  573. self.callback('end')
  574. def __repr__(self):
  575. return "%s()" % self.__class__.__name__
  576. class QuerystringParser(BaseParser):
  577. """This is a streaming querystring parser. It will consume data, and call
  578. the callbacks given when it has data.
  579. .. list-table::
  580. :widths: 15 10 30
  581. :header-rows: 1
  582. * - Callback Name
  583. - Parameters
  584. - Description
  585. * - on_field_start
  586. - None
  587. - Called when a new field is encountered.
  588. * - on_field_name
  589. - data, start, end
  590. - Called when a portion of a field's name is encountered.
  591. * - on_field_data
  592. - data, start, end
  593. - Called when a portion of a field's data is encountered.
  594. * - on_field_end
  595. - None
  596. - Called when the end of a field is encountered.
  597. * - on_end
  598. - None
  599. - Called when the parser is finished parsing all data.
  600. :param callbacks: A dictionary of callbacks. See the documentation for
  601. :class:`BaseParser`.
  602. :param strict_parsing: Whether or not to parse the body strictly. Defaults
  603. to False. If this is set to True, then the behavior
  604. of the parser changes as the following: if a field
  605. has a value with an equal sign (e.g. "foo=bar", or
  606. "foo="), it is always included. If a field has no
  607. equals sign (e.g. "...&name&..."), it will be
  608. treated as an error if 'strict_parsing' is True,
  609. otherwise included. If an error is encountered,
  610. then a
  611. :class:`multipart.exceptions.QuerystringParseError`
  612. will be raised.
  613. :param max_size: The maximum size of body to parse. Defaults to infinity -
  614. i.e. unbounded.
  615. """
  616. def __init__(self, callbacks={}, strict_parsing=False,
  617. max_size=float('inf')):
  618. super(QuerystringParser, self).__init__()
  619. self.state = STATE_BEFORE_FIELD
  620. self._found_sep = False
  621. self.callbacks = callbacks
  622. # Max-size stuff
  623. if not isinstance(max_size, Number) or max_size < 1:
  624. raise ValueError("max_size must be a positive number, not %r" %
  625. max_size)
  626. self.max_size = max_size
  627. self._current_size = 0
  628. # Should parsing be strict?
  629. self.strict_parsing = strict_parsing
  630. def write(self, data):
  631. """Write some data to the parser, which will perform size verification,
  632. parse into either a field name or value, and then pass the
  633. corresponding data to the underlying callback. If an error is
  634. encountered while parsing, a QuerystringParseError will be raised. The
  635. "offset" attribute of the raised exception will be set to the offset in
  636. the input data chunk (NOT the overall stream) that caused the error.
  637. :param data: a bytestring
  638. """
  639. # Handle sizing.
  640. data_len = len(data)
  641. if (self._current_size + data_len) > self.max_size:
  642. # We truncate the length of data that we are to process.
  643. new_size = int(self.max_size - self._current_size)
  644. self.logger.warning("Current size is %d (max %d), so truncating "
  645. "data length from %d to %d",
  646. self._current_size, self.max_size, data_len,
  647. new_size)
  648. data_len = new_size
  649. l = 0
  650. try:
  651. l = self._internal_write(data, data_len)
  652. finally:
  653. self._current_size += l
  654. return l
  655. def _internal_write(self, data, length):
  656. state = self.state
  657. strict_parsing = self.strict_parsing
  658. found_sep = self._found_sep
  659. i = 0
  660. while i < length:
  661. ch = data[i]
  662. # Depending on our state...
  663. if state == STATE_BEFORE_FIELD:
  664. # If the 'found_sep' flag is set, we've already encountered
  665. # and skipped a single seperator. If so, we check our strict
  666. # parsing flag and decide what to do. Otherwise, we haven't
  667. # yet reached a seperator, and thus, if we do, we need to skip
  668. # it as it will be the boundary between fields that's supposed
  669. # to be there.
  670. if ch == AMPERSAND or ch == SEMICOLON:
  671. if found_sep:
  672. # If we're parsing strictly, we disallow blank chunks.
  673. if strict_parsing:
  674. e = QuerystringParseError(
  675. "Skipping duplicate ampersand/semicolon at "
  676. "%d" % i
  677. )
  678. e.offset = i
  679. raise e
  680. else:
  681. self.logger.debug("Skipping duplicate ampersand/"
  682. "semicolon at %d", i)
  683. else:
  684. # This case is when we're skipping the (first)
  685. # seperator between fields, so we just set our flag
  686. # and continue on.
  687. found_sep = True
  688. else:
  689. # Emit a field-start event, and go to that state. Also,
  690. # reset the "found_sep" flag, for the next time we get to
  691. # this state.
  692. self.callback('field_start')
  693. i -= 1
  694. state = STATE_FIELD_NAME
  695. found_sep = False
  696. elif state == STATE_FIELD_NAME:
  697. # Try and find a seperator - we ensure that, if we do, we only
  698. # look for the equal sign before it.
  699. sep_pos = data.find(b'&', i)
  700. if sep_pos == -1:
  701. sep_pos = data.find(b';', i)
  702. # See if we can find an equals sign in the remaining data. If
  703. # so, we can immedately emit the field name and jump to the
  704. # data state.
  705. if sep_pos != -1:
  706. equals_pos = data.find(b'=', i, sep_pos)
  707. else:
  708. equals_pos = data.find(b'=', i)
  709. if equals_pos != -1:
  710. # Emit this name.
  711. self.callback('field_name', data, i, equals_pos)
  712. # Jump i to this position. Note that it will then have 1
  713. # added to it below, which means the next iteration of this
  714. # loop will inspect the character after the equals sign.
  715. i = equals_pos
  716. state = STATE_FIELD_DATA
  717. else:
  718. # No equals sign found.
  719. if not strict_parsing:
  720. # See also comments in the STATE_FIELD_DATA case below.
  721. # If we found the seperator, we emit the name and just
  722. # end - there's no data callback at all (not even with
  723. # a blank value).
  724. if sep_pos != -1:
  725. self.callback('field_name', data, i, sep_pos)
  726. self.callback('field_end')
  727. i = sep_pos - 1
  728. state = STATE_BEFORE_FIELD
  729. else:
  730. # Otherwise, no seperator in this block, so the
  731. # rest of this chunk must be a name.
  732. self.callback('field_name', data, i, length)
  733. i = length
  734. else:
  735. # We're parsing strictly. If we find a seperator,
  736. # this is an error - we require an equals sign.
  737. if sep_pos != -1:
  738. e = QuerystringParseError(
  739. "When strict_parsing is True, we require an "
  740. "equals sign in all field chunks. Did not "
  741. "find one in the chunk that starts at %d" %
  742. (i,)
  743. )
  744. e.offset = i
  745. raise e
  746. # No seperator in the rest of this chunk, so it's just
  747. # a field name.
  748. self.callback('field_name', data, i, length)
  749. i = length
  750. elif state == STATE_FIELD_DATA:
  751. # Try finding either an ampersand or a semicolon after this
  752. # position.
  753. sep_pos = data.find(b'&', i)
  754. if sep_pos == -1:
  755. sep_pos = data.find(b';', i)
  756. # If we found it, callback this bit as data and then go back
  757. # to expecting to find a field.
  758. if sep_pos != -1:
  759. self.callback('field_data', data, i, sep_pos)
  760. self.callback('field_end')
  761. # Note that we go to the seperator, which brings us to the
  762. # "before field" state. This allows us to properly emit
  763. # "field_start" events only when we actually have data for
  764. # a field of some sort.
  765. i = sep_pos - 1
  766. state = STATE_BEFORE_FIELD
  767. # Otherwise, emit the rest as data and finish.
  768. else:
  769. self.callback('field_data', data, i, length)
  770. i = length
  771. else: # pragma: no cover (error case)
  772. msg = "Reached an unknown state %d at %d" % (state, i)
  773. self.logger.warning(msg)
  774. e = QuerystringParseError(msg)
  775. e.offset = i
  776. raise e
  777. i += 1
  778. self.state = state
  779. self._found_sep = found_sep
  780. return len(data)
  781. def finalize(self):
  782. """Finalize this parser, which signals to that we are finished parsing,
  783. if we're still in the middle of a field, an on_field_end callback, and
  784. then the on_end callback.
  785. """
  786. # If we're currently in the middle of a field, we finish it.
  787. if self.state == STATE_FIELD_DATA:
  788. self.callback('field_end')
  789. self.callback('end')
  790. def __repr__(self):
  791. return "%s(keep_blank_values=%r, strict_parsing=%r, max_size=%r)" % (
  792. self.__class__.__name__,
  793. self.keep_blank_values, self.strict_parsing, self.max_size
  794. )
class MultipartParser(BaseParser):
    """This class is a streaming multipart/form-data parser.

    .. list-table::
       :widths: 15 10 30
       :header-rows: 1

       * - Callback Name
         - Parameters
         - Description
       * - on_part_begin
         - None
         - Called when a new part of the multipart message is encountered.
       * - on_part_data
         - data, start, end
         - Called when a portion of a part's data is encountered.
       * - on_part_end
         - None
         - Called when the end of a part is reached.
       * - on_header_begin
         - None
         - Called when we've found a new header in a part of a multipart
           message
       * - on_header_field
         - data, start, end
         - Called each time an additional portion of a header is read (i.e. the
           part of the header that is before the colon; the "Foo" in
           "Foo: Bar").
       * - on_header_value
         - data, start, end
         - Called when we get data for a header.
       * - on_header_end
         - None
         - Called when the current header is finished - i.e. we've reached the
           newline at the end of the header.
       * - on_headers_finished
         - None
         - Called when all headers are finished, and before the part data
           starts.
       * - on_end
         - None
         - Called when the parser is finished parsing all data.

    :param boundary: The multipart boundary.  This is required, and must match
                     what is given in the HTTP request - usually in the
                     Content-Type header.

    :param callbacks: A dictionary of callbacks.  See the documentation for
                      :class:`BaseParser`.

    :param max_size: The maximum size of body to parse.  Defaults to infinity -
                     i.e. unbounded.
    """

    def __init__(self, boundary, callbacks={}, max_size=float('inf')):
        """Set up parser state for the given boundary.

        Note: the ``callbacks`` dict is only read, never mutated, so the
        shared mutable default is safe here.
        """
        # Initialize parser state.
        super(MultipartParser, self).__init__()
        self.state = STATE_START
        self.index = self.flags = 0

        self.callbacks = callbacks

        if not isinstance(max_size, Number) or max_size < 1:
            raise ValueError("max_size must be a positive number, not %r" %
                             max_size)
        self.max_size = max_size
        self._current_size = 0

        # Setup marks.  These are used to track the state of data received.
        self.marks = {}

        # TODO: Actually use this rather than the dumb version we currently use
        # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
        # skip = [len(boundary) for x in range(256)]
        # for i in range(len(boundary) - 1):
        #     skip[ord_char(boundary[i])] = len(boundary) - i - 1
        #
        # # We use a tuple since it's a constant, and marginally faster.
        # self.skip = tuple(skip)

        # Save our boundary.  The parser matches the full delimiter line,
        # which per RFC 2046 is CRLF + "--" + boundary.
        if isinstance(boundary, text_type):  # pragma: no cover
            boundary = boundary.encode('latin-1')
        self.boundary = b'\r\n--' + boundary

        # Get a set of characters that belong to our boundary.
        self.boundary_chars = frozenset(self.boundary)

        # We also create a lookbehind list.
        # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
        # "--\r\n" at the final boundary, and the length of '\r\n--' and
        # '--\r\n' is 8 bytes.
        self.lookbehind = [NULL for x in range(len(boundary) + 8)]

    def write(self, data):
        """Write some data to the parser, which will perform size verification,
        and then parse the data into the appropriate location (e.g. header,
        data, etc.), and pass this on to the underlying callback.  If an error
        is encountered, a MultipartParseError will be raised.  The "offset"
        attribute on the raised exception will be set to the offset of the byte
        in the input chunk that caused the error.

        :param data: a bytestring
        :return: the number of bytes actually processed (may be less than
                 ``len(data)`` when ``max_size`` truncates the input)
        """
        # Handle sizing.
        data_len = len(data)
        if (self._current_size + data_len) > self.max_size:
            # We truncate the length of data that we are to process.
            new_size = int(self.max_size - self._current_size)
            self.logger.warning("Current size is %d (max %d), so truncating "
                                "data length from %d to %d",
                                self._current_size, self.max_size, data_len,
                                new_size)
            data_len = new_size

        # The try/finally ensures _current_size stays accurate even if the
        # state machine raises a MultipartParseError mid-chunk.
        l = 0
        try:
            l = self._internal_write(data, data_len)
        finally:
            self._current_size += l

        return l

    def _internal_write(self, data, length):
        """Run the state machine over ``data[0:length]``.

        State, boundary-match index and flags are persisted on ``self``
        between calls so the boundary can straddle chunk borders.
        """
        # Get values from locals.
        boundary = self.boundary

        # Get our state, flags and index.  These are persisted between calls to
        # this function.
        state = self.state
        index = self.index
        flags = self.flags

        # Our index defaults to 0.
        i = 0

        # Set a mark.
        def set_mark(name):
            self.marks[name] = i

        # Remove a mark.
        # NOTE(review): the 'reset' parameter is accepted but never used —
        # every caller gets plain pop() behavior.
        def delete_mark(name, reset=False):
            self.marks.pop(name, None)

        # Helper function that makes calling a callback with data easier. The
        # 'remaining' parameter will callback from the marked value until the
        # end of the buffer, and reset the mark, instead of deleting it.  This
        # is used at the end of the function to call our callbacks with any
        # remaining data in this chunk.
        def data_callback(name, remaining=False):
            marked_index = self.marks.get(name)
            if marked_index is None:
                return

            # If we're getting remaining data, we ignore the current i value
            # and just call with the remaining data.
            if remaining:
                self.callback(name, data, marked_index, length)
                self.marks[name] = 0

            # Otherwise, we call it from the mark to the current byte we're
            # processing.
            else:
                self.callback(name, data, marked_index, i)
                self.marks.pop(name, None)

        # For each byte...
        while i < length:
            c = data[i]

            if state == STATE_START:
                # Skip leading newlines
                if c == CR or c == LF:
                    i += 1
                    self.logger.debug("Skipping leading CR/LF at %d", i)
                    continue

                # index is used as in index into our boundary.  Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = STATE_START_BOUNDARY
                i -= 1

            elif state == STATE_START_BOUNDARY:
                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback('part_begin')

                    # Move to the next character and state.
                    state = STATE_HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches.  The +2 skips the
                    # leading "\r\n" of self.boundary, which is not present on
                    # the very first delimiter line of the body.
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " \
                              "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == STATE_HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark('header_field')

                # Move to parsing header fields.
                state = STATE_HEADER_FIELD
                i -= 1

            elif state == STATE_HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark('header_field')
                    state = STATE_HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback('header_field')

                    # Move to parsing the header value.
                    state = STATE_HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter.  If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " \
                              "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == STATE_HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark('header_value')

                # Move to the header-value state, reprocessing this character.
                state = STATE_HEADER_VALUE
                i -= 1

            elif state == STATE_HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers.  Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback('header_value')
                    self.callback('header_end')
                    state = STATE_HEADER_VALUE_ALMOST_DONE

            elif state == STATE_HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF.  If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " \
                          "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header.  Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = STATE_HEADER_FIELD_START

            elif state == STATE_HEADERS_ALMOST_DONE:
                # We're almost done our headers.  This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = "Did not find LF at end of headers (found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback('headers_finished')
                state = STATE_PART_DATA_START

            elif state == STATE_PART_DATA_START:
                # Mark the start of our part data.
                set_mark('part_data')

                # Start processing part data, including this character.
                state = STATE_PART_DATA
                i -= 1

            elif state == STATE_PART_DATA:
                # We're processing our part data right now.  During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index.  We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    # NOTE(review): the 'data_length - 1' bound means the very
                    # last byte of the buffer is never tested by this skip
                    # loop; it is handled byte-by-byte below instead.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here.  If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback('part_data')

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= (~FLAG_PART_BOUNDARY)

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback('part_end')
                            self.callback('part_begin')

                            # Move to parsing new headers.
                            index = 0
                            state = STATE_HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match.  Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= (~FLAG_PART_BOUNDARY)

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback('part_end')
                            self.callback('end')
                            state = STATE_END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0.  If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback('part_data', lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark('part_data')

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == STATE_END:
                # Do nothing and just consume a byte in the end state.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow!  Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data.  Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found.  This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted.  And, as such, it implies
        # that we haven't yet reached the end of this 'thing'.  So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback('header_field', True)
        data_callback('header_value', True)
        data_callback('part_data', True)

        # Save values to locals.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length

    def finalize(self):
        """Finalize this parser, which signals to that we are finished parsing.

        Note: It does not currently, but in the future, it will verify that we
        are in the final state of the parser (i.e. the end of the multipart
        message is well-formed), and, if not, throw an error.
        """
        # TODO: verify that we're in the state STATE_END, otherwise throw an
        # error or otherwise state that we're not finished parsing.
        pass

    def __repr__(self):
        return "%s(boundary=%r)" % (self.__class__.__name__, self.boundary)
  1236. class FormParser(object):
  1237. """This class is the all-in-one form parser. Given all the information
  1238. necessary to parse a form, it will instantiate the correct parser, create
  1239. the proper :class:`Field` and :class:`File` classes to store the data that
  1240. is parsed, and call the two given callbacks with each field and file as
  1241. they become available.
  1242. :param content_type: The Content-Type of the incoming request. This is
  1243. used to select the appropriate parser.
  1244. :param on_field: The callback to call when a field has been parsed and is
  1245. ready for usage. See above for parameters.
  1246. :param on_file: The callback to call when a file has been parsed and is
  1247. ready for usage. See above for parameters.
  1248. :param on_end: An optional callback to call when all fields and files in a
  1249. request has been parsed. Can be None.
  1250. :param boundary: If the request is a multipart/form-data request, this
  1251. should be the boundary of the request, as given in the
  1252. Content-Type header, as a bytestring.
  1253. :param file_name: If the request is of type application/octet-stream, then
  1254. the body of the request will not contain any information
  1255. about the uploaded file. In such cases, you can provide
  1256. the file name of the uploaded file manually.
  1257. :param FileClass: The class to use for uploaded files. Defaults to
  1258. :class:`File`, but you can provide your own class if you
  1259. wish to customize behaviour. The class will be
  1260. instantiated as FileClass(file_name, field_name), and it
  1261. must provide the folllowing functions::
  1262. file_instance.write(data)
  1263. file_instance.finalize()
  1264. file_instance.close()
  1265. :param FieldClass: The class to use for uploaded fields. Defaults to
  1266. :class:`Field`, but you can provide your own class if
  1267. you wish to customize behaviour. The class will be
  1268. instantiated as FieldClass(field_name), and it must
  1269. provide the folllowing functions::
  1270. field_instance.write(data)
  1271. field_instance.finalize()
  1272. field_instance.close()
  1273. :param config: Configuration to use for this FormParser. The default
  1274. values are taken from the DEFAULT_CONFIG value, and then
  1275. any keys present in this dictionary will overwrite the
  1276. default values.
  1277. """
  1278. #: This is the default configuration for our form parser.
  1279. #: Note: all file sizes should be in bytes.
  1280. DEFAULT_CONFIG = {
  1281. 'MAX_BODY_SIZE': float('inf'),
  1282. 'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024,
  1283. 'UPLOAD_DIR': None,
  1284. 'UPLOAD_KEEP_FILENAME': False,
  1285. 'UPLOAD_KEEP_EXTENSIONS': False,
  1286. # Error on invalid Content-Transfer-Encoding?
  1287. 'UPLOAD_ERROR_ON_BAD_CTE': False,
  1288. }
  1289. def __init__(self, content_type, on_field, on_file, on_end=None,
  1290. boundary=None, file_name=None, FileClass=File,
  1291. FieldClass=Field, config={}):
  1292. self.logger = logging.getLogger(__name__)
  1293. # Save variables.
  1294. self.content_type = content_type
  1295. self.boundary = boundary
  1296. self.bytes_received = 0
  1297. self.parser = None
  1298. # Save callbacks.
  1299. self.on_field = on_field
  1300. self.on_file = on_file
  1301. self.on_end = on_end
  1302. # Save classes.
  1303. self.FileClass = File
  1304. self.FieldClass = Field
  1305. # Set configuration options.
  1306. self.config = self.DEFAULT_CONFIG.copy()
  1307. self.config.update(config)
  1308. # Depending on the Content-Type, we instantiate the correct parser.
  1309. if content_type == 'application/octet-stream':
  1310. # Work around the lack of 'nonlocal' in Py2
  1311. class vars(object):
  1312. f = None
  1313. def on_start():
  1314. vars.f = FileClass(file_name, None, config=self.config)
  1315. def on_data(data, start, end):
  1316. vars.f.write(data[start:end])
  1317. def on_end():
  1318. # Finalize the file itself.
  1319. vars.f.finalize()
  1320. # Call our callback.
  1321. on_file(vars.f)
  1322. # Call the on-end callback.
  1323. if self.on_end is not None:
  1324. self.on_end()
  1325. callbacks = {
  1326. 'on_start': on_start,
  1327. 'on_data': on_data,
  1328. 'on_end': on_end,
  1329. }
  1330. # Instantiate an octet-stream parser
  1331. parser = OctetStreamParser(callbacks,
  1332. max_size=self.config['MAX_BODY_SIZE'])
  1333. elif (content_type == 'application/x-www-form-urlencoded' or
  1334. content_type == 'application/x-url-encoded'):
  1335. name_buffer = []
  1336. class vars(object):
  1337. f = None
  1338. def on_field_start():
  1339. pass
  1340. def on_field_name(data, start, end):
  1341. name_buffer.append(data[start:end])
  1342. def on_field_data(data, start, end):
  1343. if vars.f is None:
  1344. vars.f = FieldClass(b''.join(name_buffer))
  1345. del name_buffer[:]
  1346. vars.f.write(data[start:end])
  1347. def on_field_end():
  1348. # Finalize and call callback.
  1349. if vars.f is None:
  1350. # If we get here, it's because there was no field data.
  1351. # We create a field, set it to None, and then continue.
  1352. vars.f = FieldClass(b''.join(name_buffer))
  1353. del name_buffer[:]
  1354. vars.f.set_none()
  1355. vars.f.finalize()
  1356. on_field(vars.f)
  1357. vars.f = None
  1358. def on_end():
  1359. if self.on_end is not None:
  1360. self.on_end()
  1361. # Setup callbacks.
  1362. callbacks = {
  1363. 'on_field_start': on_field_start,
  1364. 'on_field_name': on_field_name,
  1365. 'on_field_data': on_field_data,
  1366. 'on_field_end': on_field_end,
  1367. 'on_end': on_end,
  1368. }
  1369. # Instantiate parser.
  1370. parser = QuerystringParser(
  1371. callbacks=callbacks,
  1372. max_size=self.config['MAX_BODY_SIZE']
  1373. )
  1374. elif content_type == 'multipart/form-data':
  1375. if boundary is None:
  1376. self.logger.error("No boundary given")
  1377. raise FormParserError("No boundary given")
  1378. header_name = []
  1379. header_value = []
  1380. headers = {}
  1381. # No 'nonlocal' on Python 2 :-(
  1382. class vars(object):
  1383. f = None
  1384. writer = None
  1385. is_file = False
  1386. def on_part_begin():
  1387. pass
  1388. def on_part_data(data, start, end):
  1389. bytes_processed = vars.writer.write(data[start:end])
  1390. # TODO: check for error here.
  1391. return bytes_processed
  1392. def on_part_end():
  1393. vars.f.finalize()
  1394. if vars.is_file:
  1395. on_file(vars.f)
  1396. else:
  1397. on_field(vars.f)
  1398. def on_header_field(data, start, end):
  1399. header_name.append(data[start:end])
  1400. def on_header_value(data, start, end):
  1401. header_value.append(data[start:end])
  1402. def on_header_end():
  1403. headers[b''.join(header_name)] = b''.join(header_value)
  1404. del header_name[:]
  1405. del header_value[:]
  1406. def on_headers_finished():
  1407. # Reset the 'is file' flag.
  1408. vars.is_file = False
  1409. # Parse the content-disposition header.
  1410. # TODO: handle mixed case
  1411. content_disp = headers.get(b'Content-Disposition')
  1412. disp, options = parse_options_header(content_disp)
  1413. # Get the field and filename.
  1414. field_name = options.get(b'name')
  1415. file_name = options.get(b'filename')
  1416. # TODO: check for errors
  1417. # Create the proper class.
  1418. if file_name is None:
  1419. vars.f = FieldClass(field_name)
  1420. else:
  1421. vars.f = FileClass(file_name, field_name, config=self.config)
  1422. vars.is_file = True
  1423. # Parse the given Content-Transfer-Encoding to determine what
  1424. # we need to do with the incoming data.
  1425. # TODO: check that we properly handle 8bit / 7bit encoding.
  1426. transfer_encoding = headers.get(b'Content-Transfer-Encoding',
  1427. b'7bit')
  1428. if (transfer_encoding == b'binary' or
  1429. transfer_encoding == b'8bit' or
  1430. transfer_encoding == b'7bit'):
  1431. vars.writer = vars.f
  1432. elif transfer_encoding == b'base64':
  1433. vars.writer = Base64Decoder(vars.f)
  1434. elif transfer_encoding == b'quoted-printable':
  1435. vars.writer = QuotedPrintableDecoder(vars.f)
  1436. else:
  1437. self.logger.warning("Unknown Content-Transfer-Encoding: "
  1438. "%r", transfer_encoding)
  1439. if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
  1440. raise FormParserError(
  1441. 'Unknown Content-Transfer-Encoding "{0}"'.format(
  1442. transfer_encoding
  1443. )
  1444. )
  1445. else:
  1446. # If we aren't erroring, then we just treat this as an
  1447. # unencoded Content-Transfer-Encoding.
  1448. vars.writer = vars.f
  1449. def on_end():
  1450. vars.writer.finalize()
  1451. if self.on_end is not None:
  1452. self.on_end()
  1453. # These are our callbacks for the parser.
  1454. callbacks = {
  1455. 'on_part_begin': on_part_begin,
  1456. 'on_part_data': on_part_data,
  1457. 'on_part_end': on_part_end,
  1458. 'on_header_field': on_header_field,
  1459. 'on_header_value': on_header_value,
  1460. 'on_header_end': on_header_end,
  1461. 'on_headers_finished': on_headers_finished,
  1462. 'on_end': on_end,
  1463. }
  1464. # Instantiate a multipart parser.
  1465. parser = MultipartParser(boundary, callbacks,
  1466. max_size=self.config['MAX_BODY_SIZE'])
  1467. else:
  1468. self.logger.warning("Unknown Content-Type: %r", content_type)
  1469. raise FormParserError("Unknown Content-Type: {0}".format(
  1470. content_type
  1471. ))
  1472. self.parser = parser
  1473. def write(self, data):
  1474. """Write some data. The parser will forward this to the appropriate
  1475. underlying parser.
  1476. :param data: a bytestring
  1477. """
  1478. self.bytes_received += len(data)
  1479. # TODO: check the parser's return value for errors?
  1480. return self.parser.write(data)
  1481. def finalize(self):
  1482. """Finalize the parser."""
  1483. if self.parser is not None and hasattr(self.parser, 'finalize'):
  1484. self.parser.finalize()
  1485. def close(self):
  1486. """Close the parser."""
  1487. if self.parser is not None and hasattr(self.parser, 'close'):
  1488. self.parser.close()
  1489. def __repr__(self):
  1490. return "%s(content_type=%r, parser=%r)" % (
  1491. self.__class__.__name__,
  1492. self.content_type,
  1493. self.parser,
  1494. )
  1495. def create_form_parser(headers, on_field, on_file, trust_x_headers=False,
  1496. config={}):
  1497. """This function is a helper function to aid in creating a FormParser
  1498. instances. Given a dictionary-like headers object, it will determine
  1499. the correct information needed, instantiate a FormParser with the
  1500. appropriate values and given callbacks, and then return the corresponding
  1501. parser.
  1502. :param headers: A dictionary-like object of HTTP headers. The only
  1503. required header is Content-Type.
  1504. :param on_field: Callback to call with each parsed field.
  1505. :param on_file: Callback to call with each parsed file.
  1506. :param trust_x_headers: Whether or not to trust information received from
  1507. certain X-Headers - for example, the file name from
  1508. X-File-Name.
  1509. :param config: Configuration variables to pass to the FormParser.
  1510. """
  1511. content_type = headers.get('Content-Type')
  1512. if content_type is None:
  1513. logging.getLogger(__name__).warning("No Content-Type header given")
  1514. raise ValueError("No Content-Type header given!")
  1515. # Boundaries are optional (the FormParser will raise if one is needed
  1516. # but not given).
  1517. content_type, params = parse_options_header(content_type)
  1518. boundary = params.get(b'boundary')
  1519. # We need content_type to be a string, not a bytes object.
  1520. content_type = content_type.decode('latin-1')
  1521. # File names are optional.
  1522. file_name = headers.get('X-File-Name')
  1523. # Instantiate a form parser.
  1524. form_parser = FormParser(content_type,
  1525. on_field,
  1526. on_file,
  1527. boundary=boundary,
  1528. file_name=file_name,
  1529. config=config)
  1530. # Return our parser.
  1531. return form_parser
  1532. def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576,
  1533. **kwargs):
  1534. """This function is useful if you just want to parse a request body,
  1535. without too much work. Pass it a dictionary-like object of the request's
  1536. headers, and a file-like object for the input stream, along with two
  1537. callbacks that will get called whenever a field or file is parsed.
  1538. :param headers: A dictionary-like object of HTTP headers. The only
  1539. required header is Content-Type.
  1540. :param input_stream: A file-like object that represents the request body.
  1541. The read() method must return bytestrings.
  1542. :param on_field: Callback to call with each parsed field.
  1543. :param on_file: Callback to call with each parsed file.
  1544. :param chunk_size: The maximum size to read from the input stream and write
  1545. to the parser at one time. Defaults to 1 MiB.
  1546. """
  1547. # Create our form parser.
  1548. parser = create_form_parser(headers, on_field, on_file)
  1549. # Read chunks of 100KiB and write to the parser, but never read more than
  1550. # the given Content-Length, if any.
  1551. content_length = headers.get('Content-Length')
  1552. if content_length is not None:
  1553. content_length = int(content_length)
  1554. else:
  1555. content_length = float('inf')
  1556. bytes_read = 0
  1557. while True:
  1558. # Read only up to the Content-Length given.
  1559. max_readable = min(content_length - bytes_read, 1048576)
  1560. buff = input_stream.read(max_readable)
  1561. # Write to the parser and update our length.
  1562. parser.write(buff)
  1563. bytes_read += len(buff)
  1564. # If we get a buffer that's smaller than the size requested, or if we
  1565. # have read up to our content length, we're done.
  1566. if len(buff) != max_readable or bytes_read == content_length:
  1567. break
  1568. # Tell our parser that we're done writing data.
  1569. parser.finalize()