import re
from typing import AnyStr, cast, List, overload, Sequence, Tuple, TYPE_CHECKING, Union

from ._abnf import field_name, field_value
from ._util import bytesify, LocalProtocolError, validate

if TYPE_CHECKING:
    from ._events import Request

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal  # type: ignore

CONTENT_LENGTH_MAX_DIGITS = 20  # allow lengths up to 10**20 - 1 bytes (~100 million TB)

# Facts
# -----
#
# Headers are:
#   keys: case-insensitive ascii
#   values: mixture of ascii and raw bytes
#
# "Historically, HTTP has allowed field content with text in the ISO-8859-1
# charset [ISO-8859-1], supporting other charsets only through use of
# [RFC2047] encoding. In practice, most HTTP header field values use only a
# subset of the US-ASCII charset [USASCII]. Newly defined header fields SHOULD
# limit their field values to US-ASCII octets. A recipient SHOULD treat other
# octets in field content (obs-text) as opaque data."
# And it deprecates all non-ascii values
#
# Leading/trailing whitespace in header names is forbidden
#
# Values get leading/trailing whitespace stripped
#
# Content-Disposition actually needs to contain unicode semantically; to
# accomplish this it has a terrifically weird way of encoding the filename
# itself as ascii (and even this still has lots of cross-browser
# incompatibilities)
#
# Order is important:
# "a proxy MUST NOT change the order of these field values when forwarding a
# message"
# (and there are several headers where the order indicates a preference)
#
# Multiple occurrences of the same header:
# "A sender MUST NOT generate multiple header fields with the same field name
# in a message unless either the entire field value for that header field is
# defined as a comma-separated list [or the header is Set-Cookie which gets a
# special exception]" - RFC 7230. (cookies are in RFC 6265)
#
# So every header aside from Set-Cookie can be merged by b", ".join if it
# occurs repeatedly. But, of course, they can't necessarily be split by
# .split(b","), because quoting.
#
# Given all this mess (case insensitive, duplicates allowed, order is
# important, ...), there doesn't appear to be any standard way to handle
# headers in Python -- they're almost like dicts, but... actually just
# aren't. For now we punt and just use a super simple representation: headers
# are a list of pairs
#
#   [(name1, value1), (name2, value2), ...]
#
# where all entries are bytestrings, names are lowercase and have no
# leading/trailing whitespace, and values are bytestrings with no
# leading/trailing whitespace. Searching and updating are done via naive O(n)
# methods.
#
# Maybe a dict-of-lists would be better?
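#
# As a small, purely illustrative sketch of the merging rule above (not
# something h11 itself runs): repeated occurrences of a comma-list header can
# be collapsed with b", ".join, e.g.
#
#     >>> values = [b"text/html", b"*/*"]   # two Accept: field values
#     >>> b", ".join(values)
#     b'text/html, */*'
#
# but the reverse .split(b",") is *not* generally safe, because commas may
# appear inside quoted strings.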

_content_length_re = re.compile(rb"[0-9]+")
_field_name_re = re.compile(field_name.encode("ascii"))
_field_value_re = re.compile(field_value.encode("ascii"))


class Headers(Sequence[Tuple[bytes, bytes]]):
    """
    A list-like interface that allows iterating over headers as byte-pairs
    of (lowercased-name, value).

    Internally we actually store the representation as three-tuples,
    including both the raw original casing, in order to preserve casing
    over-the-wire, and the lowercased name, for case-insensitive comparisons.

    r = Request(
        method="GET",
        target="/",
        headers=[("Host", "example.org"), ("Connection", "keep-alive")],
        http_version="1.1",
    )
    assert r.headers == [
        (b"host", b"example.org"),
        (b"connection", b"keep-alive")
    ]
    assert r.headers.raw_items() == [
        (b"Host", b"example.org"),
        (b"Connection", b"keep-alive")
    ]
    """

    __slots__ = "_full_items"

    def __init__(self, full_items: List[Tuple[bytes, bytes, bytes]]) -> None:
        self._full_items = full_items

    def __bool__(self) -> bool:
        return bool(self._full_items)

    def __eq__(self, other: object) -> bool:
        return list(self) == list(other)  # type: ignore

    def __len__(self) -> int:
        return len(self._full_items)

    def __repr__(self) -> str:
        return "<Headers(%s)>" % repr(list(self))

    def __getitem__(self, idx: int) -> Tuple[bytes, bytes]:  # type: ignore[override]
        _, name, value = self._full_items[idx]
        return (name, value)

    def raw_items(self) -> List[Tuple[bytes, bytes]]:
        return [(raw_name, value) for raw_name, _, value in self._full_items]


HeaderTypes = Union[
    List[Tuple[bytes, bytes]],
    List[Tuple[bytes, str]],
    List[Tuple[str, bytes]],
    List[Tuple[str, str]],
]


@overload
def normalize_and_validate(headers: Headers, _parsed: Literal[True]) -> Headers:
    ...


@overload
def normalize_and_validate(headers: HeaderTypes, _parsed: Literal[False]) -> Headers:
    ...


@overload
def normalize_and_validate(
    headers: Union[Headers, HeaderTypes], _parsed: bool = False
) -> Headers:
    ...


def normalize_and_validate(
    headers: Union[Headers, HeaderTypes], _parsed: bool = False
) -> Headers:
    new_headers = []
    seen_content_length = None
    saw_transfer_encoding = False
    for name, value in headers:
        # For headers coming out of the parser, we can safely skip some steps,
        # because it always returns bytes and has already run these regexes
        # over the data:
        if not _parsed:
            name = bytesify(name)
            value = bytesify(value)
            validate(_field_name_re, name, "Illegal header name {!r}", name)
            validate(_field_value_re, value, "Illegal header value {!r}", value)
        assert isinstance(name, bytes)
        assert isinstance(value, bytes)

        raw_name = name
        name = name.lower()
        if name == b"content-length":
            lengths = {length.strip() for length in value.split(b",")}
            if len(lengths) != 1:
                raise LocalProtocolError("conflicting Content-Length headers")
            value = lengths.pop()
            validate(_content_length_re, value, "bad Content-Length")
            if len(value) > CONTENT_LENGTH_MAX_DIGITS:
                raise LocalProtocolError("bad Content-Length")
            if seen_content_length is None:
                seen_content_length = value
                new_headers.append((raw_name, name, value))
            elif seen_content_length != value:
                raise LocalProtocolError("conflicting Content-Length headers")
        elif name == b"transfer-encoding":
            # "A server that receives a request message with a transfer coding
            # it does not understand SHOULD respond with 501 (Not
            # Implemented)."
            # https://tools.ietf.org/html/rfc7230#section-3.3.1
            if saw_transfer_encoding:
                raise LocalProtocolError(
                    "multiple Transfer-Encoding headers", error_status_hint=501
                )
            # "All transfer-coding names are case-insensitive"
            # -- https://tools.ietf.org/html/rfc7230#section-4
            value = value.lower()
            if value != b"chunked":
                raise LocalProtocolError(
                    "Only Transfer-Encoding: chunked is supported",
                    error_status_hint=501,
                )
            saw_transfer_encoding = True
            new_headers.append((raw_name, name, value))
        else:
            new_headers.append((raw_name, name, value))
    return Headers(new_headers)
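

# Illustrative usage of normalize_and_validate (a hedged sketch; assumes a
# REPL where this module's names are importable, exact traceback formatting
# aside):
#
#     >>> h = normalize_and_validate(
#     ...     [("Content-Length", "10"), ("content-length", "10")]
#     ... )
#     >>> list(h)          # identical duplicates collapse to one entry
#     [(b'content-length', b'10')]
#     >>> h.raw_items()    # original casing is kept alongside
#     [(b'Content-Length', b'10')]
#     >>> normalize_and_validate([("Content-Length", "10"), ("Content-Length", "11")])
#     Traceback (most recent call last):
#       ...
#     h11._util.LocalProtocolError: conflicting Content-Length headers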


def get_comma_header(headers: Headers, name: bytes) -> List[bytes]:
    # Should only be used for headers whose value is a list of
    # comma-separated, case-insensitive values.
    #
    # The header name `name` is expected to be lower-case bytes.
    #
    # Connection: meets these criteria (including case insensitivity).
    #
    # Content-Length: technically is just a single value (1*DIGIT), but the
    # standard makes reference to implementations that do multiple values, and
    # using this doesn't hurt. Ditto, case insensitivity doesn't hurt things
    # either way.
    #
    # Transfer-Encoding: is more complex (allows for quoted strings), so
    # splitting on , is actually wrong. For example, this is legal:
    #
    #    Transfer-Encoding: foo; options="1,2", chunked
    #
    # and should be parsed as
    #
    #    foo; options="1,2"
    #    chunked
    #
    # but this naive function will parse it as
    #
    #    foo; options="1
    #    2"
    #    chunked
    #
    # However, this is okay because the only thing we are going to do with
    # any Transfer-Encoding is reject ones that aren't just "chunked", so
    # both of these will be treated the same anyway.
    #
    # Expect: the only legal value is the literal string
    # "100-continue". Splitting on commas is harmless. Case insensitive.
    #
    out: List[bytes] = []
    for _, found_name, found_raw_value in headers._full_items:
        if found_name == name:
            found_raw_value = found_raw_value.lower()
            for found_split_value in found_raw_value.split(b","):
                found_split_value = found_split_value.strip()
                if found_split_value:
                    out.append(found_split_value)
    return out
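

# Illustrative usage of get_comma_header (a sketch; ``headers`` built via
# normalize_and_validate above):
#
#     >>> headers = normalize_and_validate([("Connection", "keep-alive, Upgrade")])
#     >>> get_comma_header(headers, b"connection")
#     [b'keep-alive', b'upgrade']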


def set_comma_header(headers: Headers, name: bytes, new_values: List[bytes]) -> Headers:
    # The header name `name` is expected to be lower-case bytes.
    #
    # Note that when we store the header we use title casing for the header
    # names, in order to match the conventional HTTP header style.
    #
    # Simply calling `.title()` is a blunt approach, but it's correct
    # here given the cases where we're using `set_comma_header`...
    #
    # Connection, Content-Length, Transfer-Encoding.
    new_headers: List[Tuple[bytes, bytes]] = []
    for found_raw_name, found_name, found_raw_value in headers._full_items:
        if found_name != name:
            new_headers.append((found_raw_name, found_raw_value))
    for new_value in new_values:
        new_headers.append((name.title(), new_value))
    return normalize_and_validate(new_headers)
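

# Illustrative usage of set_comma_header (a sketch, continuing the example
# style above):
#
#     >>> headers = normalize_and_validate([("Connection", "keep-alive")])
#     >>> replaced = set_comma_header(headers, b"connection", [b"close"])
#     >>> list(replaced)
#     [(b'connection', b'close')]
#     >>> replaced.raw_items()   # note the re-title-cased raw name
#     [(b'Connection', b'close')]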


def has_expect_100_continue(request: "Request") -> bool:
    # https://tools.ietf.org/html/rfc7231#section-5.1.1
    # "A server that receives a 100-continue expectation in an HTTP/1.0 request
    # MUST ignore that expectation."
    if request.http_version < b"1.1":
        return False
    expect = get_comma_header(request.headers, b"expect")
    return b"100-continue" in expect