How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
Hi,
I’ve been using RLlib in a multi-agent CARLA environment (similar to this implementation), which crashes from time to time due to memory issues, and I would like to resume training after each crash. The training script and the config file are as follows:
import os
import ray
import yaml
import time
import argparse

from tensorboard import program
from ray import air, tune
from ray.tune.registry import register_env

from carla_env import CarlaEnv


argparser = argparse.ArgumentParser(description='CoPeRL Training Implementation.')
argparser.add_argument('config', help='configuration file')
argparser.add_argument('-d', '--directory',
                       metavar='D',
                       default='/home/coperl/ray_results',
                       help='directory to save the results (default: /home/coperl/ray_results)')
argparser.add_argument('-n', '--name',
                       metavar='N',
                       default='sac_experiment',
                       help='name of the experiment (default: sac_experiment)')
argparser.add_argument('--restore',
                       action='store_true',
                       default=False,
                       help='restore the specified experiment (default: False)')
argparser.add_argument('--tb',
                       action='store_true',
                       default=False,
                       help='activate tensorboard (default: False)')
args = argparser.parse_args()


def parse_config(args):
    '''
    Parse the configuration file.

    Args:
        args: command line arguments.

    Return:
        config: configuration dictionary.
    '''
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    return config


def launch_tensorboard(logdir, host='localhost', port='6006'):
    '''
    Launch TensorBoard.

    Args:
        logdir: directory of the saved results.
        host: host address.
        port: port number.

    Return:
    '''
    tb = program.TensorBoard()
    tb.configure(argv=[None, '--logdir', logdir, '--host', host, '--port', port])
    url = tb.launch()


def env_creator(env_config):
    '''
    Create Gymnasium-like environment.

    Args:
        env_config: configuration passed to the environment.

    Return:
        env: environment object.
    '''
    return CarlaEnv(env_config)


def run(args):
    '''
    Run Ray Tuner.

    Args:
        args: command line arguments.

    Return:
    '''
    try:
        ray.init(num_cpus=12, num_gpus=2)
        register_env('carla', env_creator)
        os.system('nvidia-smi')

        if not args.restore:
            tuner = tune.Tuner(
                'SAC',
                run_config=air.RunConfig(
                    name=args.name,
                    storage_path=args.directory,
                    checkpoint_config=air.CheckpointConfig(
                        num_to_keep=2,
                        checkpoint_frequency=1,
                        checkpoint_at_end=True
                    ),
                    stop={'training_iteration': 8192},
                    verbose=2
                ),
                param_space=args.config,
            )
        else:
            tuner = tune.Tuner.restore(os.path.join(args.directory, args.name), 'SAC', resume_errored=True)

        result = tuner.fit().get_best_result()
        print(result)
    except Exception as e:
        print(e)
    finally:
        ray.shutdown()
        time.sleep(10.0)


def main():
    args.config = parse_config(args)

    if args.tb:
        launch_tensorboard(logdir=os.path.join(args.directory, args.name))

    run(args)


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        ray.shutdown()
    finally:
        print('Done.')
# Config file (parsed in main() and passed to the Tuner as param_space)
framework: 'torch'
env: 'carla'
disable_env_checking: True
num_workers: 1
num_gpus: 1
num_cpus_per_worker: 8
num_gpus_per_worker: 1
train_batch_size: 256
log_level: 'DEBUG'
ignore_worker_failures: True
restart_failed_sub_environments: False
checkpoint_at_end: True
export_native_model_files: True
keep_per_episode_custom_metrics: True
q_model_config:
  fcnet_hiddens: [256, 256]
  dim: 200
  conv_filters: [
    [16, [3, 3], 2],
    [32, [3, 3], 2],
    [32, [3, 3], 2],
    [64, [3, 3], 2],
    [64, [3, 3], 2],
    [128, [3, 3], 2]
  ]
  post_fcnet_hiddens: [256]
policy_model_config:
  fcnet_hiddens: [256, 256]
  dim: 200
  conv_filters: [
    [16, [3, 3], 2],
    [32, [3, 3], 2],
    [32, [3, 3], 2],
    [64, [3, 3], 2],
    [64, [3, 3], 2],
    [128, [3, 3], 2]
  ]
  post_fcnet_hiddens: [256]
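For completeness, I launch the script roughly like this (the script and config file names here are placeholders for my actual ones):

python train.py sac_config.yaml --tb          # fresh run
python train.py sac_config.yaml --restore     # resume the crashed experiment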
However, most parameters seem to be reset when I use Tuner.restore after a crash; only a few, such as the number of steps, are carried over, as shown in the following figure (the same happens to the TD error, the mean and max Q values, the episode reward, etc.).
I tried what was suggested here, and it seems that neither the get_weights nor the set_weights function gets accessed. I also tried the algo.save() / Algorithm.from_checkpoint() approach and got similar results. I would appreciate it if someone could let me know where I’m going wrong.
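For reference, the algo.save() / Algorithm.from_checkpoint() attempt looked roughly like this (a minimal sketch; the checkpoint path is a placeholder and the manual training loop is only illustrative):

from ray.rllib.algorithms.algorithm import Algorithm
from ray.tune.registry import register_env

from carla_env import CarlaEnv


# The environment has to be registered under the same name ('carla')
# before restoring, since the checkpointed config refers to it by name.
register_env('carla', lambda env_config: CarlaEnv(env_config))

# Placeholder path; in practice I point this at one of the checkpoints
# written by the Tune run above.
checkpoint_path = '/home/coperl/ray_results/sac_experiment/<trial_dir>/<checkpoint_dir>'

# Rebuild the algorithm (config, policies, and weights) from the checkpoint
# and continue training manually.
algo = Algorithm.from_checkpoint(checkpoint_path)
for _ in range(10):
    print(algo.train())
algo.save()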