I'm training a PPO agent with tune.fit() and I hit this RuntimeError:

```
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 2x256)
```

I have already checked my env with check_env. In addition, I tried a RandomEnv as suggested in this topic and it works fine, so the problem must be somewhere in my custom env. Any advice would be appreciated.
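For reference, this is roughly how I ran the environment check (a sketch; I'm assuming RLlib's own `check_env` utility here):

```python
from ray.rllib.utils import check_env

from envs.my_env import PathPlanning

# RLlib's environment pre-check; it did not complain about my env.
check_env(PathPlanning())
```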
Here is the env (envs/my_env.py):
```python
import gym
from gym import Env
from gym import spaces
from copy import deepcopy
import numpy as np


def EachReward(current_state, rows, cols):
    p = np.random.uniform(0, 1, size=(rows, cols))
    current_reward = -p[current_state[0], current_state[1]]
    return current_reward


env_config = {
    "action_space": gym.spaces.Discrete(2),
    "observation_space": gym.spaces.Box(low=np.array([0, 0]), high=np.array([5, 5])),
}


class PathPlanning(Env):
    metadata = {}

    def __init__(self, config=None):
        if config is None:
            config = env_config
        self.rows = 5
        self.cols = 5
        self.start = (0, 0)
        self.goal = (4, 4)
        self.current_state = self.start
        self.current_step = 0
        self.action_space = config.get("action_space", gym.spaces.Discrete(3))
        self.observation_space = config.get(
            "observation_space",
            spaces.Box(low=np.array([0, 0]), high=np.array([self.rows, self.cols])),
        )

    def step(self, action):
        global reward
        new_state = np.array(deepcopy(self.current_state))
        self.current_step += 1
        if action == 0:  # up
            new_state[0] = new_state[0] - 1
        elif action == 1:  # forward
            new_state[1] = new_state[1] + 1
        elif action == 2:  # down
            new_state[0] = new_state[0] + 1
        else:
            raise Exception("Invalid action")
        self.current_state = new_state
        done = False
        if self.current_step > 30:
            done = True
            self.current_step = 0
        elif new_state[0] > self.rows - 1 or new_state[0] < 0 or new_state[1] > self.cols - 1:
            reward = -10
            self.current_state = self.start
        elif (self.current_state[1] - self.goal[1]) ** 2 + (self.current_state[0] - self.goal[0]) ** 2 == 0:
            reward = 10
            done = True
        else:
            reward = EachReward(current_state=self.current_state, rows=self.rows, cols=self.cols)
        info = {}
        obs = self.current_state
        return obs, reward, done, info

    def render(self):
        pass

    def reset(self):
        self.current_state = self.start
        self.observation = np.array(self.current_state)
        return self.observation
```
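If it helps, the env can also be stepped by hand like this to see exactly what it returns each step (just a quick debugging sketch, separate from the training code):

```python
from envs.my_env import PathPlanning

env = PathPlanning()
obs = env.reset()
print(type(obs), getattr(obs, "shape", None), getattr(obs, "dtype", None))

for _ in range(5):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    # Show the type and value of what would be fed to the policy network.
    print(action, type(obs), obs, reward, done)
    if done:
        obs = env.reset()
```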
And here is the code in train.py:
```python
import ray
from ray.rllib.algorithms import ppo
from envs.my_env import PathPlanning, env_config
from ray.tune import register_env


def env_creator(config):
    return PathPlanning(config)


register_env("PathPlanning", env_creator)

ray.init()

ppo_config = ppo.PPOConfig()
ppo_config.environment(env="PathPlanning", env_config=env_config)
ppo_config.framework(framework="torch")
ppo_config.debugging(seed=415, log_level="ERROR")
ppo_config.evaluation(
    evaluation_interval=15,
    evaluation_duration=5,
    evaluation_num_workers=2,
    evaluation_parallel_to_training=True,
    evaluation_config=dict(
        explore=False,
        num_workers=2,
    ),
)
ppo_config.rollouts(num_rollout_workers=2, num_envs_per_worker=1)
ppo_config.experimental(_disable_preprocessor_api=True)

ppo_algo = ppo_config.build()

ppo_config.training(
    lr=ray.tune.grid_search([5e-5, 2e-5]),
    train_batch_size=ray.tune.grid_search([128, 256]),
)

stop = dict(
    timesteps_total=1,
    training_iteration=1,
)

tuner = ray.tune.Tuner(
    ppo_config.algo_class,
    param_space=ppo_config.to_dict(),
    run_config=ray.air.RunConfig(
        local_dir="my_Tune_logs",
        stop=stop,
        verbose=3,
    ),
)
experiment_results = tuner.fit()
```
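To narrow things down, the next thing I plan to try is a single training step on the already-built algorithm outside of Tune (just a sketch of what I mean, not verified yet):

```python
# One manual training iteration, bypassing Tune, to check whether the
# matmul error also shows up when calling the algorithm directly.
result = ppo_algo.train()
print(result.get("episode_reward_mean"))
ppo_algo.stop()
```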