Ok I reduced the code because I’m not allowed to share it but thats what I’m basically doing:
I have a python script with this code:
@serve.deployment
class Lama(rayNuke_pb2_grpc.PredictAPIsServiceServicer, CustomgRPCIngress):
def __init__(self):
self.worker_ip = ray.util.get_node_ip_address()
handle = serve.get_deployment_handle('DeploymentInfo', 'DeploymentInfoApp')
grpc_port = handle.get_port.remote().result()
super().__init__(grpc_port)
lama_app = Lama.bind()
and another one with that:
# This class serves as registry wich manages grcp addresses of multiple machines
@serve.deployment
class DeploymentInfo(rayNuke_pb2_grpc.PredictAPIsServiceServicer, CustomgRPCIngress):
def __init__(self):
# do init stuff here
def get_port(self):
# generate port number for the machine
test_port = 6798
return test_port
DeploymentInfo_app = DeploymentInfo.bind()
I have a deployment config like this:
proxy_location: HeadOnly
http_options:
host: 0.0.0.0
port: 8000
applications:
- name: DeploymentInfoApp
route_prefix: /deployment
import_path: DeploymentInfo:DeploymentInfo_app
deployments:
- name: DeploymentInfo
num_replicas: 1
- name: LamaApp
route_prefix: /lama
import_path: lama_refined:lama_app
deployments:
- name: Lama
num_replicas: 1
ray_actor_options:
num_cpus: 8
num_gpus: 1
When I test this with serve run deployment_config.yaml it says:
(ServeController pid=89680, ip=10.14.71.23) INFO 2023-11-07 11:46:13,891 controller 89680 deployment_state.py:2027 - Replica LamaApp#Lama#kZRiet is stopped.
(ServeController pid=89680, ip=10.14.71.23) INFO 2023-11-07 11:46:13,892 controller 89680 deployment_state.py:1679 - Adding 1 replica to deployment Lama in application 'LamaApp'.
(ServeReplica:DeploymentInfoApp:DeploymentInfo pid=29204, ip=10.14.71.23) INFO 2023-11-07 11:46:13,905 DeploymentInfo DeploymentInfoApp#DeploymentInfo#SkdLUJ DeploymentInfoApp replica.py:749 - ADD_ACTOR OK 0.0ms
(HTTPProxyActor pid=2352) INFO 2023-11-07 20:46:15,112 http_proxy 10.14.228.50 http_proxy.py:1433 - Proxy actor cf679dc61c4acc07b975e4e210000000 starting on node c973b330b5ce3f9eee0b7cca3a9b8bb4c32bacc8d57885c1c19eec2e.
(HTTPProxyActor pid=2352) INFO 2023-11-07 20:46:15,121 http_proxy 10.14.228.50 http_proxy.py:1617 - Starting HTTP server on node: c973b330b5ce3f9eee0b7cca3a9b8bb4c32bacc8d57885c1c19eec2e listening on port 8000
(HTTPProxyActor pid=2352) INFO: Started server process [2352]
(ServeController pid=89680, ip=10.14.71.23) INFO 2023-11-07 11:46:15,524 controller 89680 http_state.py:260 - Start to drain the proxy actor on node c973b330b5ce3f9eee0b7cca3a9b8bb4c32bacc8d57885c1c19eec2e
(ServeReplica:LamaApp:Lama pid=1660) DeprecationWarning: `route_prefix` in `@serve.deployment` has been deprecated. To specify a route prefix for an application, pass it into `serve.run` instead.
(ServeReplica:LamaApp:Lama pid=1660) Detectron v2 is not installed
(ServeController pid=89680, ip=10.14.71.23) INFO 2023-11-07 11:46:28,045 controller 89680 http_state.py:271 - Stop draining the proxy actor on node c973b330b5ce3f9eee0b7cca3a9b8bb4c32bacc8d57885c1c19eec2e
(ServeController pid=89680, ip=10.14.71.23) ERROR 2023-11-07 11:46:28,150 controller 89680 deployment_state.py:617 - Exception in replica 'LamaApp#Lama#JgXThC', the replica will be stopped.
(ServeController pid=89680, ip=10.14.71.23) Traceback (most recent call last):
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\serve\_private\deployment_state.py", line 615, in check_ready
(ServeController pid=89680, ip=10.14.71.23) _, self._version = ray.get(self._ready_obj_ref)
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\_private\auto_init_hook.py", line 24, in auto_init_wrapper
(ServeController pid=89680, ip=10.14.71.23) return fn(*args, **kwargs)
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
(ServeController pid=89680, ip=10.14.71.23) return func(*args, **kwargs)
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\_private\worker.py", line 2547, in get
(ServeController pid=89680, ip=10.14.71.23) raise value.as_instanceof_cause()
(ServeController pid=89680, ip=10.14.71.23) ray.exceptions.RayTaskError(RuntimeError): ray::ServeReplica:LamaApp:Lama.initialize_and_get_metadata() (pid=1660, ip=10.14.228.50, actor_id=9e6a32709d85e1ff757061ee10000000, repr=<ray.ser
ve._private.replica.ServeReplica:LamaApp:Lama object at 0x0000029E25196940>)
(ServeController pid=89680, ip=10.14.71.23) File "python\ray\_raylet.pyx", line 1616, in ray._raylet.execute_task
(ServeController pid=89680, ip=10.14.71.23) File "python\ray\_raylet.pyx", line 1551, in ray._raylet.execute_task.function_executor
(ServeController pid=89680, ip=10.14.71.23) File "python\ray\_raylet.pyx", line 4283, in ray._raylet.CoreWorker.run_async_func_or_coro_in_event_loop
(ServeController pid=89680, ip=10.14.71.23) File "C:\Python39\lib\concurrent\futures\_base.py", line 445, in result
(ServeController pid=89680, ip=10.14.71.23) return self.__get_result()
(ServeController pid=89680, ip=10.14.71.23) File "C:\Python39\lib\concurrent\futures\_base.py", line 390, in __get_result
(ServeController pid=89680, ip=10.14.71.23) raise self._exception
(ServeController pid=89680, ip=10.14.71.23) File "python\ray\_raylet.pyx", line 4270, in async_func
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\util\tracing\tracing_helper.py", line 499, in _resume_span
(ServeController pid=89680, ip=10.14.71.23) return await method(self, *_args, **_kwargs)
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\serve\_private\replica.py", line 442, in initialize_and_get_metadata
(ServeController pid=89680, ip=10.14.71.23) raise RuntimeError(traceback.format_exc()) from None
(ServeController pid=89680, ip=10.14.71.23) RuntimeError: Traceback (most recent call last):
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\serve\_private\replica.py", line 430, in initialize_and_get_metadata
(ServeController pid=89680, ip=10.14.71.23) await self._initialize_replica()
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\serve\_private\replica.py", line 190, in initialize_replica
(ServeController pid=89680, ip=10.14.71.23) await sync_to_async(_callable.__init__)(*init_args, **init_kwargs)
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\_private\async_compat.py", line 34, in wrapper
(ServeController pid=89680, ip=10.14.71.23) return func(*args, **kwargs)
(ServeController pid=89680, ip=10.14.71.23) File "\\inferno2\projects\common\home\oku\repos\RAY\ray_server\deployment\scripts\lama_refined\lamaRefined.py", line 89, in __init__
(ServeController pid=89680, ip=10.14.71.23) grpc_port = self.info_actor.get_port.remote().result()
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\serve\handle.py", line 647, in result
(ServeController pid=89680, ip=10.14.71.23) self._to_object_ref_sync(_record_telemetry=False), timeout=timeout_s
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\serve\handle.py", line 677, in _to_object_ref_sync
(ServeController pid=89680, ip=10.14.71.23) return self._to_object_ref_or_gen_sync(_record_telemetry=_record_telemetry)
(ServeController pid=89680, ip=10.14.71.23) File "S:\local_repos\envs\torch_py_39_trt_onnx\lib\site-packages\ray\serve\handle.py", line 524, in _to_object_ref_or_gen_sync
(ServeController pid=89680, ip=10.14.71.23) raise RuntimeError(
(ServeController pid=89680, ip=10.14.71.23) RuntimeError: Sync methods should not be called from within an `asyncio` event loop. Use `await response` or `await response._to_object_ref()` instead.