Google Colab: invalid literal for int() with base 10: 'max\n'

How severe does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

Recently, when trying to run the method ray.tune.run on Google Colab for hyperparameter tuning (in this case of a PyTorch network), a ValueError: invalid literal for int() with base 10: 'max\n' appears. This has been happening for a few hours now; previously there was no problem at all. I get this error in my private notebook, but I think it is a general problem, since this PyTorch tutorial notebook suffers from the same error. (Don’t forget to pip install ray if you want to reproduce it on that notebook.)

The error appears for any tune.run call; for example, this one from my own project:

from ray import tune

# train, train_ds, val_ds, config, num_samples, scheduler and reporter
# are defined earlier in the notebook.
result = tune.run(
    tune.with_parameters(train, train_ds=train_ds, val_ds=val_ds),
    resources_per_trial={"cpu": 0, "gpu": 1},
    config=config,
    num_samples=num_samples,
    scheduler=scheduler,
    progress_reporter=reporter,
)
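
Since the stack trace below shows tune.run failing inside Ray's automatic ray.init() call rather than in Tune itself, the error should be reproducible on an affected Colab runtime with nothing more than:

import ray

# Raises the same ValueError, because the crash happens during node
# startup (while reading the container memory limit), not in Tune.
ray.init()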

The complete stack trace:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-9e0de7fcc159> in <module>()
     32     num_samples=num_samples,
     33     scheduler=scheduler,
---> 34     progress_reporter=reporter)
     35 

/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, _experiment_checkpoint_dir, queue_trials, loggers, _remote)
    346 
    347     if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
--> 348         _ray_auto_init()
    349 
    350     if _remote:

/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in _ray_auto_init()
    839             "call `ray.init(...)` before `tune.run`."
    840         )
--> 841         ray.init()

/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
    103             if func.__name__ != "init" or is_client_mode_enabled_by_default:
    104                 return getattr(ray, func.__name__)(*args, **kwargs)
--> 105         return func(*args, **kwargs)
    106 
    107     return wrapper

/usr/local/lib/python3.7/dist-packages/ray/worker.py in init(address, num_cpus, num_gpus, resources, object_store_memory, local_mode, ignore_reinit_error, include_dashboard, dashboard_host, dashboard_port, job_config, configure_logging, logging_level, logging_format, log_to_driver, namespace, runtime_env, storage, _enable_object_reconstruction, _redis_max_memory, _plasma_directory, _node_ip_address, _driver_object_store_memory, _memory, _redis_password, _temp_dir, _metrics_export_port, _system_config, _tracing_startup_hook, **kwargs)
   1021         # isn't called.
   1022         _global_node = ray.node.Node(
-> 1023             head=True, shutdown_at_exit=False, spawn_reaper=True, ray_params=ray_params
   1024         )
   1025     else:

/usr/local/lib/python3.7/dist-packages/ray/node.py in __init__(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only)
    299 
    300         if not connect_only:
--> 301             self.start_ray_processes()
    302             # we should update the address info after the node has been started
    303             try:

/usr/local/lib/python3.7/dist-packages/ray/node.py in start_ray_processes(self)
   1128         # Make sure we don't call `determine_plasma_store_config` multiple
   1129         # times to avoid printing multiple warnings.
-> 1130         resource_spec = self.get_resource_spec()
   1131         (
   1132             plasma_directory,

/usr/local/lib/python3.7/dist-packages/ray/node.py in get_resource_spec(self)
    479                 resources,
    480                 self._ray_params.redis_max_memory,
--> 481             ).resolve(is_head=self.head, node_ip_address=self.node_ip_address)
    482         return self._resource_spec
    483 

/usr/local/lib/python3.7/dist-packages/ray/_private/resource_spec.py in resolve(self, is_head, node_ip_address)
    195 
    196         # Choose a default object store size.
--> 197         system_memory = ray._private.utils.get_system_memory()
    198         avail_memory = ray._private.utils.estimate_available_memory()
    199         object_store_memory = self.object_store_memory

/usr/local/lib/python3.7/dist-packages/ray/_private/utils.py in get_system_memory()
    434     elif os.path.exists(memory_limit_filename_v2):
    435         with open(memory_limit_filename_v2, "r") as f:
--> 436             docker_limit = int(f.read())
    437 
    438     # Use psutil if it is available.

ValueError: invalid literal for int() with base 10: 'max\n'

How can I fix or work around this problem?

Hey @victordmz, this problem was introduced in newer versions of Ray. It should be fixed once [Core] Fix in-container memory limit fetching for cgroups v2 by clarkzinzow · Pull Request #23922 · ray-project/ray · GitHub is merged.
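
For context: on cgroups v2, the container memory limit file (memory.max) contains the literal string max, plus a trailing newline, when no limit is set, while Ray's get_system_memory assumed the file always holds an integer. A minimal sketch of the failing step, independent of Ray:

# What get_system_memory effectively does on cgroups v2, reduced to
# the failing step: the file contains "max" instead of a number when
# the container is unlimited.
contents = "max\n"
docker_limit = int(contents)  # ValueError: invalid literal for int() with base 10: 'max\n'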

In the meantime, you can add and execute the following code right before tune.run; this should fix the problem:

import psutil
import ray

# Patch Ray's memory detection: skip parsing the cgroup limit file
# (which contains 'max' on Colab) and report total memory via psutil.
ray._private.utils.get_system_memory = lambda: psutil.virtual_memory().total
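
If you would rather not bypass container limits entirely, below is a sketch of a more targeted patch in the spirit of the linked PR (not its exact code; the cgroup file paths are the standard locations and are assumptions about your runtime): treat max as "no limit" and fall back to psutil only in that case.

import os

import psutil
import ray

def patched_get_system_memory():
    # cgroups v1 and v2 expose the container memory limit in different files.
    candidates = [
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroups v1
        "/sys/fs/cgroup/memory.max",                    # cgroups v2
    ]
    for limit_file in candidates:
        if os.path.exists(limit_file):
            with open(limit_file) as f:
                value = f.read().strip()
            # On cgroups v2, "max" means the container is unlimited, so
            # only a purely numeric value counts as a real limit.
            if value.isdigit():
                return min(int(value), psutil.virtual_memory().total)
    # No finite container limit found: report the host's total memory.
    return psutil.virtual_memory().total

ray._private.utils.get_system_memory = patched_get_system_memory

Either way, the patch must run before the first ray.init() (and thus before tune.run), since the memory check happens during node startup.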