How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
Recently, when I try to run ray.tune.run on Google Colab for hyperparameter tuning (in this case, of a PyTorch network), the following error appears: ValueError: invalid literal for int() with base 10: 'max\n'
This has been the case for a few hours now; previously there was no problem at all. I get this error in my private notebook, but I think it is a general problem, since this PyTorch tutorial notebook suffers from the same error. (Don’t forget to pip install ray if you want to reproduce it in that notebook.)
The error appears on any tune.run call. For example, this is the call in my own project:
result = tune.run(
tune.with_parameters(train,train_ds=train_ds,val_ds=val_ds),
resources_per_trial={"cpu": 0, "gpu": 1},
config=config,
num_samples=num_samples,
scheduler=scheduler,
progress_reporter=reporter)
The complete stack trace:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-9e0de7fcc159> in <module>()
32 num_samples=num_samples,
33 scheduler=scheduler,
---> 34 progress_reporter=reporter)
35
8 frames
/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, _experiment_checkpoint_dir, queue_trials, loggers, _remote)
346
347 if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
--> 348 _ray_auto_init()
349
350 if _remote:
/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in _ray_auto_init()
839 "call `ray.init(...)` before `tune.run`."
840 )
--> 841 ray.init()
/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
103 if func.__name__ != "init" or is_client_mode_enabled_by_default:
104 return getattr(ray, func.__name__)(*args, **kwargs)
--> 105 return func(*args, **kwargs)
106
107 return wrapper
/usr/local/lib/python3.7/dist-packages/ray/worker.py in init(address, num_cpus, num_gpus, resources, object_store_memory, local_mode, ignore_reinit_error, include_dashboard, dashboard_host, dashboard_port, job_config, configure_logging, logging_level, logging_format, log_to_driver, namespace, runtime_env, storage, _enable_object_reconstruction, _redis_max_memory, _plasma_directory, _node_ip_address, _driver_object_store_memory, _memory, _redis_password, _temp_dir, _metrics_export_port, _system_config, _tracing_startup_hook, **kwargs)
1021 # isn't called.
1022 _global_node = ray.node.Node(
-> 1023 head=True, shutdown_at_exit=False, spawn_reaper=True, ray_params=ray_params
1024 )
1025 else:
/usr/local/lib/python3.7/dist-packages/ray/node.py in __init__(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only)
299
300 if not connect_only:
--> 301 self.start_ray_processes()
302 # we should update the address info after the node has been started
303 try:
/usr/local/lib/python3.7/dist-packages/ray/node.py in start_ray_processes(self)
1128 # Make sure we don't call `determine_plasma_store_config` multiple
1129 # times to avoid printing multiple warnings.
-> 1130 resource_spec = self.get_resource_spec()
1131 (
1132 plasma_directory,
/usr/local/lib/python3.7/dist-packages/ray/node.py in get_resource_spec(self)
479 resources,
480 self._ray_params.redis_max_memory,
--> 481 ).resolve(is_head=self.head, node_ip_address=self.node_ip_address)
482 return self._resource_spec
483
/usr/local/lib/python3.7/dist-packages/ray/_private/resource_spec.py in resolve(self, is_head, node_ip_address)
195
196 # Choose a default object store size.
--> 197 system_memory = ray._private.utils.get_system_memory()
198 avail_memory = ray._private.utils.estimate_available_memory()
199 object_store_memory = self.object_store_memory
/usr/local/lib/python3.7/dist-packages/ray/_private/utils.py in get_system_memory()
434 elif os.path.exists(memory_limit_filename_v2):
435 with open(memory_limit_filename_v2, "r") as f:
--> 436 docker_limit = int(f.read())
437
438 # Use psutil if it is available.
ValueError: invalid literal for int() with base 10: 'max\n'
How can I fix or work around this problem?