Hello,
I’m trying to understand how to make ray serve rollback to previous version if current one failed.
Here is a minimal example :
from time import sleep
import ray
from fastapi import FastAPI
from ray import serve
from ray.serve import list_deployments
ray.init()
serve.start()
app = FastAPI()
@serve.deployment(num_replicas=2)
@serve.ingress(app)
class Server:
def __init__(self, crash=False):
if crash:
raise RuntimeError("THIS IS AN ERROR ERROR")
@app.get("/")
def get(self):
return "hi"
print("DEPLOY VERSION 1")
Server.options(version="11").deploy()
sleep(1)
Server.options(version="23", prev_version="11").deploy(crash=True)
while True:
sleep(1)
print(list_deployments())
In the console, I get the message :
(ServeController pid=39183) 2022-01-18 12:31:45,202 INFO checkpoint_path.py:16 -- Using RayInternalKVStore for controller checkpoint and recovery.
(ServeController pid=39183) 2022-01-18 12:31:45,311 INFO http_state.py:101 -- Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:drOYbh:SERVE_PROXY_ACTOR-node:127.0.0.1-0' on node 'node:127.0.0.1-0' listening on '127.0.0.1:8000'
2022-01-18 12:31:45,803 INFO api.py:475 -- Started Serve instance in namespace '41bda91b-46a0-40ff-8ca8-aebf0948b767'.
2022-01-18 12:31:45,821 INFO api.py:249 -- Updating deployment 'Server' to version '11'. component=serve deployment=Server
DEPLOY VERSION 1
(ServeController pid=39183) 2022-01-18 12:31:45,861 INFO deployment_state.py:919 -- Adding 2 replicas to deployment 'Server'. component=serve deployment=Server
(HTTPProxyActor pid=39179) INFO: Started server process [39179]
2022-01-18 12:31:46,541 INFO api.py:262 -- Deployment 'Server:11' is ready at `http://127.0.0.1:8000/Server`. component=serve deployment=Server
2022-01-18 12:31:47,554 INFO api.py:249 -- Updating deployment 'Server' to version '23'. component=serve deployment=Server
(ServeController pid=39183) 2022-01-18 12:31:47,644 INFO deployment_state.py:881 -- Stopping 1 replicas of deployment 'Server' with outdated versions. component=serve deployment=Server
(ServeController pid=39183) 2022-01-18 12:31:50,329 ERROR deployment_state.py:284 -- Exception in deployment 'Server'
(ServeController pid=39183) 2022-01-18 12:31:55,857 INFO deployment_state.py:919 -- Adding 1 replicas to deployment 'Server'. component=serve deployment=Server
Traceback (most recent call last):
File "my_file.py", line 34, in <module>
Server.options(version="23", prev_version="11").deploy(crash=True)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/api.py", line 817, in deploy
_blocking=_blocking)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/api.py", line 93, in check
return f(self, *args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/api.py", line 255, in deploy
self._wait_for_goal(goal_id)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/api.py", line 182, in _wait_for_goal
async_goal_exception = ray.get(ready)[0]
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/worker.py", line 1741, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::ServeController.listen_for_change() (pid=39183, ip=127.0.0.1)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/utils.py", line 241, in wrap_to_ray_error
raise exception
RuntimeError: Failed to reach deployment goal. Check the serve logs for details.
(ServeController pid=39183) 2022-01-18 12:31:58,082 ERROR deployment_state.py:284 -- Exception in deployment 'Server'
(ServeController pid=39183) Traceback (most recent call last):
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/deployment_state.py", line 277, in check_ready
(ServeController pid=39183) deployment_config, version = ray.get(ready)[0]
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
(ServeController pid=39183) return func(*args, **kwargs)
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/worker.py", line 1741, in get
(ServeController pid=39183) raise value.as_instanceof_cause()
(ServeController pid=39183) ray.exceptions.RayTaskError(RuntimeError): ray::RayServeWrappedReplica.reconfigure() (pid=39217, ip=127.0.0.1, repr=<ray.serve.replica.Server object at 0x7f9e40f7ea10>)
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/concurrent/futures/_base.py", line 428, in result
(ServeController pid=39183) return self.__get_result()
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
(ServeController pid=39183) raise self._exception
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/replica.py", line 143, in reconfigure
(ServeController pid=39183) await self._initialize_replica()
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/replica.py", line 92, in initialize_replica
(ServeController pid=39183) **init_kwargs)
(ServeController pid=39183) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ray/serve/api.py", line 593, in __init__
(ServeController pid=39183) super().__init__(*args, **kwargs)
(ServeController pid=39183) File "my_file.py", line 20, in __init__
(ServeController pid=39183) raise RuntimeError("THIS IS AN ERROR ERROR")
(ServeController pid=39183) RuntimeError: THIS IS AN ERROR ERROR
(ServeController pid=39183) 2022-01-18 12:31:58,085 INFO deployment_state.py:1201 -- Updating deployment 'Server' failed, rolling back to version 11. component=serve deployment=Server
(ServeController pid=39183) 2022-01-18 12:31:58,195 INFO deployment_state.py:939 -- Removing 1 replicas from deployment 'Server'. component=serve deployment=Server
Process finished with exit code 1
The line Updating deployment 'Server' failed, rolling back to version 11. component=serve deployment=Server
would let me believe that my deployment is going to rollback to previous version, but it does not seem to be the case.
Any hint ?
Thank you in advance