Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(remote runner): disable retries by default #3155

Merged
merged 6 commits into from Oct 31, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
46 changes: 29 additions & 17 deletions src/bentoml/_internal/runner/runner_handle/remote.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
import json
import pickle
import typing as t
Expand All @@ -21,6 +22,7 @@

if TYPE_CHECKING:
import yarl
import aiohttp
from aiohttp import BaseConnector
from aiohttp.client import ClientSession

Expand Down Expand Up @@ -126,10 +128,10 @@ def strip_query_params(url: yarl.URL) -> str:
)
return self._client_cache

def _reset_client(self):
async def _reset_client(self):
self._close_conn()
if self._client_cache is not None:
self._client_cache.close()
await self._client_cache.close()
self._client_cache = None

async def async_run_method(
Expand All @@ -138,6 +140,8 @@ async def async_run_method(
*args: P.args,
**kwargs: P.kwargs,
) -> R | tuple[R, ...]:
import aiohttp

from ...runner.container import AutoContainer

inp_batch_dim = __bentoml_method.config.batch_dim[0]
Expand Down Expand Up @@ -166,21 +170,29 @@ async def async_run_method(
},
) as resp:
body = await resp.read()
except aiohttp.ClientOSError:
# most likely the TCP connection has been closed; retry after reconnecting
self._reset_client()
async with self._client.post(
f"{self._addr}/{path}",
data=pickle.dumps(payload_params), # FIXME: pickle inside pickle
headers={
"Bento-Name": component_context.bento_name,
"Bento-Version": component_context.bento_version,
"Runner-Name": self._runner.name,
"Yatai-Bento-Deployment-Name": component_context.yatai_bento_deployment_name,
"Yatai-Bento-Deployment-Namespace": component_context.yatai_bento_deployment_namespace,
},
) as resp:
body = await resp.read()
except aiohttp.ClientOSError as e:
if os.getenv("BENTOML_RETRY_RUNNER_REQUESTS").lower() == "true":
try:
# most likely the TCP connection has been closed; retry after reconnecting
await self._reset_client()
async with self._client.post(
f"{self._addr}/{path}",
data=pickle.dumps(
payload_params
), # FIXME: pickle inside pickle
headers={
"Bento-Name": component_context.bento_name,
"Bento-Version": component_context.bento_version,
"Runner-Name": self._runner.name,
"Yatai-Bento-Deployment-Name": component_context.yatai_bento_deployment_name,
"Yatai-Bento-Deployment-Namespace": component_context.yatai_bento_deployment_namespace,
},
) as resp:
body = await resp.read()
except aiohttp.ClientOSError as e:
raise RemoteException(f"Failed to connect to runner server.")
else:
raise RemoteException(f"Failed to connect to runner server.") from e

try:
content_type = resp.headers["Content-Type"]
Expand Down