|
- # We use native strings for all the re patterns, to take advantage of string
- # formatting, and then convert to bytestrings when compiling the final re
- # objects.
-
- # https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#whitespace
- # OWS = *( SP / HTAB )
- # ; optional whitespace
- OWS = r"[ \t]*"
-
- # https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.token.separators
- # token = 1*tchar
- #
- # tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
- # / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
- # / DIGIT / ALPHA
- # ; any VCHAR, except delimiters
- token = r"[-!#$%&'*+.^_`|~0-9a-zA-Z]+"
-
- # https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#header.fields
- # field-name = token
- field_name = token
-
- # The standard says:
- #
- # field-value = *( field-content / obs-fold )
- # field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ]
- # field-vchar = VCHAR / obs-text
- # obs-fold = CRLF 1*( SP / HTAB )
- # ; obsolete line folding
- # ; see Section 3.2.4
- #
- # https://tools.ietf.org/html/rfc5234#appendix-B.1
- #
- # VCHAR = %x21-7E
- # ; visible (printing) characters
- #
- # https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.quoted-string
- # obs-text = %x80-FF
- #
- # However, the standard definition of field-content is WRONG! It disallows
- # fields containing a single visible character surrounded by whitespace,
- # e.g. "foo a bar".
- #
- # See: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
- #
- # So our definition of field_content attempts to fix it up...
- vchar_or_obs_text = r"[\x21-\xff]"
- field_vchar = vchar_or_obs_text
- field_content = r"{field_vchar}+(?:[ \t]+{field_vchar}+)*".format(**globals())
-
- # We handle obs-fold at a different level, and our fixed-up field_content
- # already grows to swallow the whole value, so ? instead of *
- field_value = r"({field_content})?".format(**globals())
-
- # header-field = field-name ":" OWS field-value OWS
- header_field = (
- r"(?P<field_name>{field_name})"
- r":"
- r"{OWS}"
- r"(?P<field_value>{field_value})"
- r"{OWS}"
- .format(**globals()))
-
- # https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#request.line
- #
- # request-line = method SP request-target SP HTTP-version CRLF
- # method = token
- # HTTP-version = HTTP-name "/" DIGIT "." DIGIT
- # HTTP-name = %x48.54.54.50 ; "HTTP", case-sensitive
- #
- # request-target is complicated (see RFC 7230 sec 5.3) -- could be path, full
- # URL, host+port (for connect), or even "*", but in any case we are guaranteed
- # that it contains no spaces (see sec 3.1.1).
- method = token
- request_target = r"[^ ]+"
- http_version = r"HTTP/(?P<http_version>[0-9]\.[0-9])"
- request_line = (
- r"(?P<method>{method})"
- r" "
- r"(?P<target>{request_target})"
- r" "
- r"{http_version}"
- .format(**globals()))
-
- # https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#status.line
- #
- # status-line = HTTP-version SP status-code SP reason-phrase CRLF
- # status-code = 3DIGIT
- # reason-phrase = *( HTAB / SP / VCHAR / obs-text )
- status_code = r"[0-9]{3}"
- reason_phrase = r"([ \t]|{vchar_or_obs_text})*".format(**globals())
- status_line = (
- r"{http_version}"
- r" "
- r"(?P<status_code>{status_code})"
- # However, there are apparently a few too many servers out there that just
- # leave out the reason phrase:
- # https://github.com/scrapy/scrapy/issues/345#issuecomment-281756036
- # https://github.com/seanmonstar/httparse/issues/29
- # so make it optional. ?: is a non-capturing group.
- r"(?: (?P<reason>{reason_phrase}))?"
- .format(**globals()))
-
- HEXDIG = r"[0-9A-Fa-f]"
- # Actually
- #
- # chunk-size = 1*HEXDIG
- #
- # but we impose an upper-limit to avoid ridiculosity. len(str(2**64)) == 20
- chunk_size = r"({HEXDIG}){{1,20}}".format(**globals())
- # Actually
- #
- # chunk-ext = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
- #
- # but we aren't parsing the things so we don't really care.
- chunk_ext = r";.*"
- chunk_header = (
- r"(?P<chunk_size>{chunk_size})"
- r"(?P<chunk_ext>{chunk_ext})?"
- r"\r\n"
- .format(**globals()))
|