Thanks for your prompt reply.
I used ray.init(local_mode=True) to run Ray in local mode, but it doesn't solve the issue. I noticed that the failing call already happens inside build(), before I start training. I also couldn't get execution to break inside get_action_dist() for the cartpole_ppo example; I suspect that's because it runs on the new API stack.
For now, I'd prefer to stick with the old API stack, since I'm using custom policy models and the RLModule API is still experimental.
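For reference, this is roughly how I'm forcing the old stack in my own setup. I'm assuming api_stack() with these two flags is the supported switch in recent Ray releases, so please correct me if that isn't the right knob:

from ray.rllib.algorithms.ppo import PPOConfig

# Assumption: these two flags opt out of the new RLModule / EnvRunner stack.
old_stack_config = PPOConfig().api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)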
Below is a piece of code that throws the error I mentioned. It crashes before training even starts. I'm on Ray/RLlib 2.38 and Python 3.12.7.
from gymnasium.spaces import MultiDiscrete, Box  # RLlib 2.x expects gymnasium, not the legacy gym package
import numpy as np
from ray.rllib.env import EnvContext, MultiAgentEnv
class MultiDiscreteEnv(MultiAgentEnv):
    def __init__(self, config: EnvContext):
        super().__init__()
        # Define a MultiDiscrete action space with two components:
        # the first ranges over 0-4, the second over 0-2.
        self.action_space = MultiDiscrete([5, 3])
        # Observation space: an array of size 3 with continuous values.
        self.observation_space = Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
        # Set an initial state for demonstration.
        self.state = np.zeros(3, dtype=np.float32)
        self.step_count = 0

    def reset(self, *, seed=None, options=None):
        # Reset the state and step count (gymnasium API: return obs and info dict).
        self.state = np.zeros(3, dtype=np.float32)
        self.step_count = 0
        return self.state, {}

    def step(self, action):
        # Action is an array/tuple (e.g., (3, 1)) with one entry per discrete component.
        action_0, action_1 = action
        # Simplified reward calculation.
        reward = 1.0 if action_0 == 3 and action_1 == 1 else 0.0
        # Update the state as a function of the action (for demonstration).
        self.state = np.array([action_0 / 5.0, action_1 / 2.0, 0.5], dtype=np.float32)
        # Increment the step count.
        self.step_count += 1
        terminated = self.step_count >= 10  # Terminate the episode after 10 steps.
        # gymnasium API: obs, reward, terminated, truncated, info.
        return self.state, reward, terminated, False, {}
# Register this environment with Ray Tune so RLlib can look it up by name.
from ray.tune.registry import register_env

def env_creator(env_config):
    return MultiDiscreteEnv(env_config)

register_env("MultiDiscreteEnv-v0", env_creator)
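For what it's worth, this is the kind of standalone check I use to confirm the environment wiring before handing it to RLlib (a hypothetical snippet, not part of the original repro):

# Drive the environment by hand, outside RLlib.
env = MultiDiscreteEnv({})
obs, info = env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
print(obs, reward, terminated)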
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPO
# Initialize Ray
ray.init(ignore_reinit_error=True, local_mode=True)
# Define the PPO configuration
config = {
    "env": "MultiDiscreteEnv-v0",
    "framework": "torch",  # Use PyTorch (switch to "tf2" for TensorFlow)
    "num_gpus": 0,  # Change to 1 if you have a GPU available
    "num_workers": 1,  # Parallelism; set to >1 to use multiple CPU cores
    "env_config": {},  # Any environment-specific parameters go here
    "model": {
        "fcnet_hiddens": [64, 64],  # Fully connected layers of the policy network
        "fcnet_activation": "relu",  # Activation function for each layer
    },
    "multiagent": {
        "policies": {
            "default_policy": (
                None,
                MultiDiscreteEnv({}).observation_space,
                MultiDiscreteEnv({}).action_space,
                {},
            )
        },
        # RLlib passes extra arguments (episode, worker, ...) to this callable.
        "policy_mapping_fn": lambda agent_id, *args, **kwargs: "default_policy",
    },
    "rollout_fragment_length": 200,
    "train_batch_size": 4000,
    "sgd_minibatch_size": 128,
    "num_sgd_iter": 10,
    "lr": 5e-4,  # Learning rate
    "gamma": 0.99,  # Discount factor
}
# Building the algorithm is already where the crash happens for me.
trainer = PPO(config=config)

# Run PPO via Tune (pass the trainable class, not the built instance).
tune.run(
    PPO,
    config=config,
    stop={"episode_reward_mean": 5},  # Stop criterion for demonstration
    local_dir="./ray_results",  # Directory to save results
)
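In case it helps, here is my understanding of the equivalent setup with the PPOConfig builder while staying on the old stack. The api_stack() / env_runners() names are my assumption of the current method names, and I've left out the single-default-policy multiagent block:

from ray.rllib.algorithms.ppo import PPOConfig

ppo_config = (
    PPOConfig()
    # Assumption: keep everything on the old API stack.
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    .environment("MultiDiscreteEnv-v0", env_config={})
    .framework("torch")
    .env_runners(num_env_runners=1, rollout_fragment_length=200)
    .training(
        train_batch_size=4000,
        num_sgd_iter=10,
        lr=5e-4,
        gamma=0.99,
        model={"fcnet_hiddens": [64, 64], "fcnet_activation": "relu"},
    )
    .resources(num_gpus=0)
)
algo = ppo_config.build()  # build() is the stage where my dict-config version above already crashes.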