I’ve launched about 12000 tasks (each requesting 1 CPU) on a Ray cluster on GCP with preemptible VMs. It seems that as tasks finish and resources get freed, I still get IDLE processes, and the CPU usage keeps going down instead of going up (even though I still have a lot of tasks to do). Again, I have over 12000 tasks, and the cluster only has 492.0 CPUs in total. I’ve set max_workers to up to 20 each.
When CPU usage gets low (around 150), it goes back up a bit, but still very slowly. I’m not sure why it keeps saying (no resource demands)
when there are still thousands of tasks in the queue.
Am I doing something wrong?
2021-04-27 18:04:57,118 INFO autoscaler.py:309 --
======== Autoscaler status: 2021-04-27 18:04:57.118407 ========
Node status
---------------------------------------------------------------
Healthy:
1 ray_head_default
4 ray_worker_64
3 ray_worker_80
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
357.0/492.0 CPU
0.00/1385.210 GiB memory
0.00/595.004 GiB object_store_memory
Demands:
(no resource demands)
2021-04-27 18:05:03,826 INFO autoscaler.py:309 --
======== Autoscaler status: 2021-04-27 18:05:03.825992 ========
Node status
---------------------------------------------------------------
Healthy:
1 ray_head_default
4 ray_worker_64
3 ray_worker_80
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
354.0/492.0 CPU
0.00/1385.210 GiB memory
0.00/595.004 GiB object_store_memory
Demands:
(no resource demands)
2021-04-27 18:05:10,553 INFO autoscaler.py:309 --
======== Autoscaler status: 2021-04-27 18:05:10.552934 ========
Node status
---------------------------------------------------------------
Healthy:
1 ray_head_default
4 ray_worker_64
3 ray_worker_80
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
349.0/492.0 CPU
0.00/1385.210 GiB memory
0.00/595.004 GiB object_store_memory
Demands:
(no resource demands)
>>> ray.cluster_resources()
{'memory': 1487358432055.0, 'node:10.30.30.47': 1.0, 'CPU': 492.0, 'object_store_memory': 638881062909.0, 'node:10.30.30.38': 1.0, 'node:10.30.30.46': 1.0, 'node:10.30.30.44': 1.0, 'node:10.30.30.78': 1.0, 'node:10.30.30.77': 1.0, 'node:10.30.30.50': 1.0, 'node:10.30.30.51': 1.0}
>>> ray.available_resources()
{'node:10.30.30.78': 1.0, 'object_store_memory': 638881062909.0, 'memory': 1487358432055.0, 'node:10.30.30.77': 1.0, 'node:10.30.30.47': 1.0, 'CPU': 146.0, 'node:10.30.30.50': 1.0, 'node:10.30.30.51': 1.0, 'node:10.30.30.38': 1.0, 'node:10.30.30.44': 1.0, 'node:10.30.30.46': 1.0}
>>>
Then, at some point, I hit this in monitor.log: StandardAutoscaler: Too many errors, abort.
2021-04-27 20:46:23,519 ERROR autoscaler.py:142 -- StandardAutoscaler: Error during autoscaling.
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 140, in update
self._update()
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 164, in _update
nodes = self.workers()
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 723, in workers
return self.provider.non_terminated_nodes(
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/gcp/node_provider.py", line 83, in non_terminated_nodes
response = self.compute.instances().list(
File "/usr/local/lib/python3.8/site-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.8/site-packages/googleapiclient/http.py", line 920, in execute
resp, content = _retry_request(
File "/usr/local/lib/python3.8/site-packages/googleapiclient/http.py", line 222, in _retry_request
raise exception
File "/usr/local/lib/python3.8/site-packages/googleapiclient/http.py", line 191, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/usr/local/lib/python3.8/site-packages/google_auth_httplib2.py", line 209, in request
self.credentials.before_request(self._request, method, uri, request_headers)
File "/usr/local/lib/python3.8/site-packages/google/auth/credentials.py", line 133, in before_request
self.refresh(request)
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/credentials.py", line 111, in refresh
self._retrieve_info(request)
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/credentials.py", line 87, in _retrieve_info
info = _metadata.get_service_account_info(
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/_metadata.py", line 234, in get_service_account_info
return get(request, path, params={"recursive": "true"})
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/_metadata.py", line 150, in get
response = request(url=url, method="GET", headers=_METADATA_HEADERS)
File "/usr/local/lib/python3.8/site-packages/google_auth_httplib2.py", line 119, in __call__
response, data = self.http.request(
File "/usr/local/lib/python3.8/site-packages/httplib2/__init__.py", line 1708, in request
(response, content) = self._request(
File "/usr/local/lib/python3.8/site-packages/httplib2/__init__.py", line 1424, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/usr/local/lib/python3.8/site-packages/httplib2/__init__.py", line 1347, in _conn_request
conn.request(method, request_uri, body, headers)
File "/usr/local/lib/python3.8/http/client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/local/lib/python3.8/http/client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.8/http/client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.8/http/client.py", line 1010, in _send_output
self.send(msg)
File "/usr/local/lib/python3.8/http/client.py", line 971, in send
self.sock.sendall(data)
BrokenPipeError: [Errno 32] Broken pipe
2021-04-27 20:46:23,520 CRITICAL autoscaler.py:152 -- StandardAutoscaler: Too many errors, abort.
2021-04-27 20:46:23,520 ERROR monitor.py:253 -- Error in monitor loop
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/ray/_private/monitor.py", line 284, in run
self._run()
File "/usr/local/lib/python3.8/site-packages/ray/_private/monitor.py", line 187, in _run
self.autoscaler.update()
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 154, in update
raise e
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 140, in update
self._update()
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 164, in _update
nodes = self.workers()
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 723, in workers
return self.provider.non_terminated_nodes(
File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/gcp/node_provider.py", line 83, in non_terminated_nodes
response = self.compute.instances().list(
File "/usr/local/lib/python3.8/site-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.8/site-packages/googleapiclient/http.py", line 920, in execute
resp, content = _retry_request(
File "/usr/local/lib/python3.8/site-packages/googleapiclient/http.py", line 222, in _retry_request
raise exception
File "/usr/local/lib/python3.8/site-packages/googleapiclient/http.py", line 191, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/usr/local/lib/python3.8/site-packages/google_auth_httplib2.py", line 209, in request
self.credentials.before_request(self._request, method, uri, request_headers)
File "/usr/local/lib/python3.8/site-packages/google/auth/credentials.py", line 133, in before_request
self.refresh(request)
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/credentials.py", line 111, in refresh
self._retrieve_info(request)
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/credentials.py", line 87, in _retrieve_info
info = _metadata.get_service_account_info(
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/_metadata.py", line 234, in get_service_account_info
return get(request, path, params={"recursive": "true"})
File "/usr/local/lib/python3.8/site-packages/google/auth/compute_engine/_metadata.py", line 150, in get
response = request(url=url, method="GET", headers=_METADATA_HEADERS)
File "/usr/local/lib/python3.8/site-packages/google_auth_httplib2.py", line 119, in __call__
response, data = self.http.request(
File "/usr/local/lib/python3.8/site-packages/httplib2/__init__.py", line 1708, in request
(response, content) = self._request(
File "/usr/local/lib/python3.8/site-packages/httplib2/__init__.py", line 1424, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/usr/local/lib/python3.8/site-packages/httplib2/__init__.py", line 1347, in _conn_request
conn.request(method, request_uri, body, headers)
File "/usr/local/lib/python3.8/http/client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/local/lib/python3.8/http/client.py", line 1301, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.8/http/client.py", line 1250, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.8/http/client.py", line 1010, in _send_output
self.send(msg)
File "/usr/local/lib/python3.8/http/client.py", line 971, in send
self.sock.sendall(data)
BrokenPipeError: [Errno 32] Broken pipe