import re

from ._util import LocalProtocolError, bytesify, validate
from ._abnf import field_name, field_value

# Facts
# -----
#
# Headers are:
#   keys: case-insensitive ascii
#   values: mixture of ascii and raw bytes
#
# "Historically, HTTP has allowed field content with text in the ISO-8859-1
# charset [ISO-8859-1], supporting other charsets only through use of
# [RFC2047] encoding. In practice, most HTTP header field values use only a
# subset of the US-ASCII charset [USASCII]. Newly defined header fields SHOULD
# limit their field values to US-ASCII octets. A recipient SHOULD treat other
# octets in field content (obs-text) as opaque data."
# And it deprecates all non-ascii values
#
# Leading/trailing whitespace in header names is forbidden
#
# Values get leading/trailing whitespace stripped
#
# Content-Disposition actually needs to contain unicode semantically; to
# accomplish this it has a terrifically weird way of encoding the filename
# itself as ascii (and even this still has lots of cross-browser
# incompatibilities)
#
# Order is important:
# "a proxy MUST NOT change the order of these field values when forwarding a
# message"
# (and there are several headers where the order indicates a preference)
#
# Multiple occurrences of the same header:
# "A sender MUST NOT generate multiple header fields with the same field name
# in a message unless either the entire field value for that header field is
# defined as a comma-separated list [or the header is Set-Cookie which gets a
# special exception]" - RFC 7230. (cookies are in RFC 6265)
#
# So every header aside from Set-Cookie can be merged by b", ".join if it
# occurs repeatedly. But, of course, they can't necessarily be split by
# .split(b","), because quoting.
#
# Given all this mess (case insensitive, duplicates allowed, order is
# important, ...), there doesn't appear to be any standard way to handle
# headers in Python -- they're almost like dicts, but... actually just
# aren't. For now we punt and just use a super simple representation: headers
# are a list of pairs
#
#   [(name1, value1), (name2, value2), ...]
#
# where all entries are bytestrings, names are lowercase and have no
# leading/trailing whitespace, and values are bytestrings with no
# leading/trailing whitespace. Searching and updating are done via naive O(n)
# methods.
#
# Maybe a dict-of-lists would be better?
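#
# As a concrete illustration of that representation (values here are just
# examples), a message containing
#
#   Host: example.com
#   Accept: text/html
#   Accept: application/json
#
# would be stored as
#
#   [(b"host", b"example.com"),
#    (b"accept", b"text/html"),
#    (b"accept", b"application/json")]
#
# and the two Accept entries could be merged into a single
# b"text/html, application/json" value with b", ".join, as noted above.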

_content_length_re = re.compile(br"[0-9]+")
_field_name_re = re.compile(field_name.encode("ascii"))
_field_value_re = re.compile(field_value.encode("ascii"))


def normalize_and_validate(headers, _parsed=False):
    new_headers = []
    saw_content_length = False
    saw_transfer_encoding = False
    for name, value in headers:
        # For headers coming out of the parser, we can safely skip some steps,
        # because it always returns bytes and has already run these regexes
        # over the data:
        if not _parsed:
            name = bytesify(name)
            value = bytesify(value)
            validate(_field_name_re, name, "Illegal header name {!r}", name)
            validate(_field_value_re, value, "Illegal header value {!r}", value)
        name = name.lower()
        if name == b"content-length":
            if saw_content_length:
                raise LocalProtocolError("multiple Content-Length headers")
            validate(_content_length_re, value, "bad Content-Length")
            saw_content_length = True
        if name == b"transfer-encoding":
            # "A server that receives a request message with a transfer coding
            # it does not understand SHOULD respond with 501 (Not
            # Implemented)."
            # https://tools.ietf.org/html/rfc7230#section-3.3.1
            if saw_transfer_encoding:
                raise LocalProtocolError("multiple Transfer-Encoding headers",
                                         error_status_hint=501)
            # "All transfer-coding names are case-insensitive"
            # -- https://tools.ietf.org/html/rfc7230#section-4
            value = value.lower()
            if value != b"chunked":
                raise LocalProtocolError(
                    "Only Transfer-Encoding: chunked is supported",
                    error_status_hint=501)
            saw_transfer_encoding = True
        new_headers.append((name, value))
    return new_headers
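

# A rough sketch of what this normalization produces (illustrative values
# only): str input is byteified, names are lowercased, and protocol
# violations raise LocalProtocolError.
#
#   normalize_and_validate([("Content-Length", "10")])
#   # -> [(b"content-length", b"10")]
#
#   normalize_and_validate([("Content-Length", "10"), ("Content-Length", "20")])
#   # -> raises LocalProtocolError("multiple Content-Length headers")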


def get_comma_header(headers, name):
    # Should only be used for headers whose value is a list of
    # comma-separated, case-insensitive values.
    #
    # The header name `name` is expected to be lower-case bytes.
    #
    # Connection: meets these criteria (including case insensitivity).
    #
    # Content-Length: technically is just a single value (1*DIGIT), but the
    # standard makes reference to implementations that do multiple values, and
    # using this doesn't hurt. Ditto, case insensitivity doesn't matter either
    # way.
    #
    # Transfer-Encoding: is more complex (allows for quoted strings), so
    # splitting on , is actually wrong. For example, this is legal:
    #
    #    Transfer-Encoding: foo; options="1,2", chunked
    #
    # and should be parsed as
    #
    #    foo; options="1,2"
    #    chunked
    #
    # but this naive function will parse it as
    #
    #    foo; options="1
    #    2"
    #    chunked
    #
    # However, this is okay because the only thing we are going to do with
    # any Transfer-Encoding is reject ones that aren't just "chunked", so
    # both of these will be treated the same anyway.
    #
    # Expect: the only legal value is the literal string
    # "100-continue". Splitting on commas is harmless. Case insensitive.
    #
    out = []
    for found_name, found_raw_value in headers:
        if found_name == name:
            found_raw_value = found_raw_value.lower()
            for found_split_value in found_raw_value.split(b","):
                found_split_value = found_split_value.strip()
                if found_split_value:
                    out.append(found_split_value)
    return out
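

# For instance (illustrative values only), duplicate Connection headers are
# collected, lowercased, split on commas, and stripped:
#
#   get_comma_header([(b"connection", b"close"),
#                     (b"connection", b"Keep-Alive, Upgrade")],
#                    b"connection")
#   # -> [b"close", b"keep-alive", b"upgrade"]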


def set_comma_header(headers, name, new_values):
    # The header name `name` is expected to be lower-case bytes.
    new_headers = []
    for found_name, found_raw_value in headers:
        if found_name != name:
            new_headers.append((found_name, found_raw_value))
    for new_value in new_values:
        new_headers.append((name, new_value))
    headers[:] = normalize_and_validate(new_headers)
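

# For instance (illustrative values only), this replaces all existing
# occurrences of a header in place while leaving other headers untouched:
#
#   headers = [(b"host", b"example.com"), (b"connection", b"close")]
#   set_comma_header(headers, b"connection", [b"keep-alive"])
#   # headers is now [(b"host", b"example.com"), (b"connection", b"keep-alive")]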


def has_expect_100_continue(request):
    # https://tools.ietf.org/html/rfc7231#section-5.1.1
    # "A server that receives a 100-continue expectation in an HTTP/1.0 request
    # MUST ignore that expectation."
    if request.http_version < b"1.1":
        return False
    expect = get_comma_header(request.headers, b"expect")
    return (b"100-continue" in expect)