From 535c58ec3fa902798c9b3442ec00bc1d7084e308 Mon Sep 17 00:00:00 2001 From: Andrew Svetlov Date: Mon, 13 Dec 2021 15:21:18 +0200 Subject: [PATCH] Support absolute-form and authority-form URLs by web server (#6409) Co-authored-by: Sviatoslav Sydorenko --- CHANGES/6227.bugfix | 1 + aiohttp/_http_parser.pyx | 72 ++++++++++++++++++++-------------- aiohttp/http_parser.py | 39 ++++++++++++------ aiohttp/web_request.py | 14 ++++++- tests/test_http_parser.py | 24 +++++++++++- tests/test_proxy_functional.py | 23 ++++++----- tests/test_web_request.py | 8 ++++ 7 files changed, 123 insertions(+), 58 deletions(-) create mode 100644 CHANGES/6227.bugfix diff --git a/CHANGES/6227.bugfix b/CHANGES/6227.bugfix new file mode 100644 index 00000000000..df097565bcd --- /dev/null +++ b/CHANGES/6227.bugfix @@ -0,0 +1 @@ +Started supporting ``authority-form`` and ``absolute-form`` URLs on the server-side. diff --git a/aiohttp/_http_parser.pyx b/aiohttp/_http_parser.pyx index b76d723fb2e..bebd9894374 100644 --- a/aiohttp/_http_parser.pyx +++ b/aiohttp/_http_parser.pyx @@ -425,7 +425,7 @@ cdef class HttpParser: raw_headers = tuple(self._raw_headers) headers = CIMultiDictProxy(self._headers) - if upgrade or self._cparser.method == 5: # cparser.CONNECT: + if upgrade or self._cparser.method == cparser.HTTP_CONNECT: self._upgraded = True # do not support old websocket spec @@ -453,7 +453,7 @@ cdef class HttpParser: if ( ULLONG_MAX > self._cparser.content_length > 0 or chunked or - self._cparser.method == 5 or # CONNECT: 5 + self._cparser.method == cparser.HTTP_CONNECT or (self._cparser.status_code >= 199 and self._cparser.content_length == 0 and self._read_until_eof) @@ -586,34 +586,45 @@ cdef class HttpRequestParser(HttpParser): self._path = self._buf.decode('utf-8', 'surrogateescape') try: idx3 = len(self._path) - idx1 = self._path.find("?") - if idx1 == -1: - query = "" - idx2 = self._path.find("#") - if idx2 == -1: - path = self._path - fragment = "" - else: - path = self._path[0: idx2] - fragment = self._path[idx2+1:] + if self._cparser.method == cparser.HTTP_CONNECT: + # authority-form, + # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.3 + self._url = URL.build(authority=self._path, encoded=True) + elif idx3 > 1 and self._path[0] == '/': + # origin-form, + # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1 + idx1 = self._path.find("?") + if idx1 == -1: + query = "" + idx2 = self._path.find("#") + if idx2 == -1: + path = self._path + fragment = "" + else: + path = self._path[0: idx2] + fragment = self._path[idx2+1:] - else: - path = self._path[0:idx1] - idx1 += 1 - idx2 = self._path.find("#", idx1+1) - if idx2 == -1: - query = self._path[idx1:] - fragment = "" else: - query = self._path[idx1: idx2] - fragment = self._path[idx2+1:] - - self._url = URL.build( - path=path, - query_string=query, - fragment=fragment, - encoded=True, - ) + path = self._path[0:idx1] + idx1 += 1 + idx2 = self._path.find("#", idx1+1) + if idx2 == -1: + query = self._path[idx1:] + fragment = "" + else: + query = self._path[idx1: idx2] + fragment = self._path[idx2+1:] + + self._url = URL.build( + path=path, + query_string=query, + fragment=fragment, + encoded=True, + ) + else: + # absolute-form for proxy maybe, + # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.2 + self._url = URL(self._path, encoded=True) finally: PyByteArray_Resize(self._buf, 0) @@ -726,7 +737,10 @@ cdef int cb_on_headers_complete(cparser.llhttp_t* parser) except -1: pyparser._last_error = exc return -1 else: - if pyparser._cparser.upgrade or pyparser._cparser.method == 5: # CONNECT + if ( + pyparser._cparser.upgrade or + pyparser._cparser.method == cparser.HTTP_CONNECT + ): return 2 else: return 0 diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py index 2dc9482f4fa..aba659eed6e 100644 --- a/aiohttp/http_parser.py +++ b/aiohttp/http_parser.py @@ -532,9 +532,6 @@ def parse_message(self, lines: List[bytes]) -> RawRequestMessage: "Status line is too long", str(self.max_line_size), str(len(path)) ) - path_part, _hash_separator, url_fragment = path.partition("#") - path_part, _question_mark_separator, qs_part = path_part.partition("?") - # method if not METHRE.match(method): raise BadStatusLine(method) @@ -549,6 +546,31 @@ def parse_message(self, lines: List[bytes]) -> RawRequestMessage: except Exception: raise BadStatusLine(version) + if method == "CONNECT": + # authority-form, + # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.3 + url = URL.build(authority=path, encoded=True) + elif path.startswith("/"): + # origin-form, + # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1 + path_part, _hash_separator, url_fragment = path.partition("#") + path_part, _question_mark_separator, qs_part = path_part.partition("?") + + # NOTE: `yarl.URL.build()` is used to mimic what the Cython-based + # NOTE: parser does, otherwise it results into the same + # NOTE: HTTP Request-Line input producing different + # NOTE: `yarl.URL()` objects + url = URL.build( + path=path_part, + query_string=qs_part, + fragment=url_fragment, + encoded=True, + ) + else: + # absolute-form for proxy maybe, + # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.2 + url = URL(path, encoded=True) + # read headers ( headers, @@ -575,16 +597,7 @@ def parse_message(self, lines: List[bytes]) -> RawRequestMessage: compression, upgrade, chunked, - # NOTE: `yarl.URL.build()` is used to mimic what the Cython-based - # NOTE: parser does, otherwise it results into the same - # NOTE: HTTP Request-Line input producing different - # NOTE: `yarl.URL()` objects - URL.build( - path=path_part, - query_string=qs_part, - fragment=url_fragment, - encoded=True, - ), + url, ) diff --git a/aiohttp/web_request.py b/aiohttp/web_request.py index b3574cafb34..e63b69778a9 100644 --- a/aiohttp/web_request.py +++ b/aiohttp/web_request.py @@ -164,14 +164,24 @@ def __init__( self._headers = message.headers self._method = message.method self._version = message.version - self._rel_url = message.url + self._cache = {} # type: Dict[str, Any] + url = message.url + if url.is_absolute(): + # absolute URL is given, + # override auto-calculating url, host, and scheme + # all other properties should be good + self._cache["url"] = url + self._cache["host"] = url.host + self._cache["scheme"] = url.scheme + self._rel_url = url.relative() + else: + self._rel_url = message.url self._post = ( None ) # type: Optional[MultiDictProxy[Union[str, bytes, FileField]]] self._read_bytes = None # type: Optional[bytes] self._state = state - self._cache = {} # type: Dict[str, Any] self._task = task self._client_max_size = client_max_size self._loop = loop diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index 20b213d9ae9..26cf1b989f1 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -362,7 +362,15 @@ def test_compression_unknown(parser) -> None: assert msg.compression is None -def test_headers_connect(parser) -> None: +def test_url_connect(parser: Any) -> None: + text = b"CONNECT www.google.com HTTP/1.1\r\n" b"content-length: 0\r\n\r\n" + messages, upgrade, tail = parser.feed_data(text) + msg, payload = messages[0] + assert upgrade + assert msg.url == URL.build(authority="www.google.com") + + +def test_headers_connect(parser: Any) -> None: text = b"CONNECT www.google.com HTTP/1.1\r\n" b"content-length: 0\r\n\r\n" messages, upgrade, tail = parser.feed_data(text) msg, payload = messages[0] @@ -370,7 +378,19 @@ def test_headers_connect(parser) -> None: assert isinstance(payload, streams.StreamReader) -def test_headers_old_websocket_key1(parser) -> None: +def test_url_absolute(parser: Any) -> None: + text = ( + b"GET https://www.google.com/path/to.html HTTP/1.1\r\n" + b"content-length: 0\r\n\r\n" + ) + messages, upgrade, tail = parser.feed_data(text) + msg, payload = messages[0] + assert not upgrade + assert msg.method == "GET" + assert msg.url == URL("https://www.google.com/path/to.html") + + +def test_headers_old_websocket_key1(parser: Any) -> None: text = b"GET /test HTTP/1.1\r\n" b"SEC-WEBSOCKET-KEY1: line\r\n\r\n" with pytest.raises(http_exceptions.BadHttpMessage): diff --git a/tests/test_proxy_functional.py b/tests/test_proxy_functional.py index e27aac40d8d..3cb4c7e28f3 100644 --- a/tests/test_proxy_functional.py +++ b/tests/test_proxy_functional.py @@ -341,29 +341,28 @@ async def test_proxy_http_absolute_path(proxy_test_server, get_request) -> None: assert len(proxy.requests_list) == 1 assert proxy.request.method == "GET" assert proxy.request.host == "aiohttp.io" - assert proxy.request.path_qs == "http://aiohttp.io/path?query=yes" + assert proxy.request.path_qs == "/path?query=yes" async def test_proxy_http_raw_path(proxy_test_server, get_request) -> None: url = "http://aiohttp.io:2561/space sheep?q=can:fly" - raw_url = "http://aiohttp.io:2561/space%20sheep?q=can:fly" + raw_url = "/space%20sheep?q=can:fly" proxy = await proxy_test_server() await get_request(url=url, proxy=proxy.url) - assert proxy.request.host == "aiohttp.io:2561" + assert proxy.request.host == "aiohttp.io" assert proxy.request.path_qs == raw_url async def test_proxy_http_idna_support(proxy_test_server, get_request) -> None: url = "http://éé.com/" - raw_url = "http://xn--9caa.com/" proxy = await proxy_test_server() await get_request(url=url, proxy=proxy.url) - assert proxy.request.host == "xn--9caa.com" - assert proxy.request.path_qs == raw_url + assert proxy.request.host == "éé.com" + assert proxy.request.path_qs == "/" async def test_proxy_http_connection_error(get_request) -> None: @@ -759,7 +758,7 @@ async def test_proxy_from_env_http(proxy_test_server, get_request, mocker) -> No assert len(proxy.requests_list) == 1 assert proxy.request.method == "GET" assert proxy.request.host == "aiohttp.io" - assert proxy.request.path_qs == "http://aiohttp.io/path" + assert proxy.request.path_qs == "/path" assert "Proxy-Authorization" not in proxy.request.headers @@ -781,7 +780,7 @@ async def test_proxy_from_env_http_with_auth(proxy_test_server, get_request, moc assert len(proxy.requests_list) == 1 assert proxy.request.method == "GET" assert proxy.request.host == "aiohttp.io" - assert proxy.request.path_qs == "http://aiohttp.io/path" + assert proxy.request.path_qs == "/path" assert proxy.request.headers["Proxy-Authorization"] == auth.encode() @@ -807,7 +806,7 @@ async def test_proxy_from_env_http_with_auth_from_netrc( assert len(proxy.requests_list) == 1 assert proxy.request.method == "GET" assert proxy.request.host == "aiohttp.io" - assert proxy.request.path_qs == "http://aiohttp.io/path" + assert proxy.request.path_qs == "/path" assert proxy.request.headers["Proxy-Authorization"] == auth.encode() @@ -833,7 +832,7 @@ async def test_proxy_from_env_http_without_auth_from_netrc( assert len(proxy.requests_list) == 1 assert proxy.request.method == "GET" assert proxy.request.host == "aiohttp.io" - assert proxy.request.path_qs == "http://aiohttp.io/path" + assert proxy.request.path_qs == "/path" assert "Proxy-Authorization" not in proxy.request.headers @@ -857,7 +856,7 @@ async def test_proxy_from_env_http_without_auth_from_wrong_netrc( assert len(proxy.requests_list) == 1 assert proxy.request.method == "GET" assert proxy.request.host == "aiohttp.io" - assert proxy.request.path_qs == "http://aiohttp.io/path" + assert proxy.request.path_qs == "/path" assert "Proxy-Authorization" not in proxy.request.headers @@ -873,7 +872,7 @@ async def xtest_proxy_from_env_https(proxy_test_server, get_request, mocker): assert len(proxy.requests_list) == 2 assert proxy.request.method == "GET" assert proxy.request.host == "aiohttp.io" - assert proxy.request.path_qs == "https://aiohttp.io/path" + assert proxy.request.path_qs == "/path" assert "Proxy-Authorization" not in proxy.request.headers diff --git a/tests/test_web_request.py b/tests/test_web_request.py index ab17da6ca23..4321deded15 100644 --- a/tests/test_web_request.py +++ b/tests/test_web_request.py @@ -157,6 +157,14 @@ def test_non_ascii_raw_path() -> None: assert "/путь" == req.raw_path +def test_absolute_url() -> None: + req = make_mocked_request("GET", "https://example.com/path/to?a=1") + assert req.url == URL("https://example.com/path/to?a=1") + assert req.scheme == "https" + assert req.host == "example.com" + assert req.rel_url == URL.build(path="/path/to", query={"a": "1"}) + + def test_content_length() -> None: req = make_mocked_request("Get", "/", CIMultiDict([("CONTENT-LENGTH", "123")]))