RuntimError when I trained a custom environment

I’m training a PPO agent by tune.fit() and I met this RunError:

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 2x256)

I had checked my env by check_env.In addtion, I had tried a RandomEnv as this topic said and it works well.So there must be some problems in my custom env.Any advices would be appreciated.
Here are the env:

import gym
from gym import Env
from gym import spaces
from copy import deepcopy
import numpy as np

def EachReward(
        current_state,
        rows,
        cols):
    p = np.random.uniform(0, 1, size=(rows, cols))
    current_reward = -p[current_state[0], current_state[1]]

    return current_reward

env_config = {
    "action_space": gym.spaces.Discrete(2),
    "observation_space": gym.spaces.Box(low=np.array([0, 0]), high=np.array([5, 5]))
}
class PathPlanning(Env):
    metadata = {}

    def __init__(self, config=None):
        if config is None:
            config = env_config
        self.rows = 5
        self.cols = 5
        self.start = (0, 0)
        self.goal = (4, 4)
        self.current_state = self.start
        self.current_step = 0
        self.action_space = config.get("action_space", gym.spaces.Discrete(3))
        self.observation_space = config.get("observation_space",spaces.Box(low=np.array([0, 0]), high=np.array([self.rows, self.cols])))

    def step(self,
             action):
        global reward
        new_state = np.array(deepcopy(self.current_state))
        self.current_step += 1
        if action == 0:  # up
            new_state[0] = new_state[0] - 1
        elif action == 1:  # forward
            new_state[1] = new_state[1] + 1
        elif action == 2:  # down
            new_state[0] = new_state[0] + 1
        else:
            raise Exception("Invalid action")
        self.current_state = new_state

        done = False
        if self.current_step > 30:
            done = True
            self.current_step = 0
        elif new_state[0] > self.rows - 1 or new_state[0] < 0 or new_state[1] > self.cols - 1:
            reward = -10
            self.current_state = self.start
        elif (self.current_state[1] - self.goal[1]) ** 2 + (self.current_state[0] - self.goal[0]) ** 2 == 0:
            reward = 10
            done = True
        else:
            reward = EachReward(current_state=self.current_state, rows=self.rows, cols=self.cols)

        info = {}
        obs = self.current_state
        return obs, reward, done, info

    def render(self):
        pass

    def reset(self):
        self.current_state = self.start
        self.observation = np.array(self.current_state)
        return self.observation

And here are the codes in train.py:

import ray
from ray.rllib.algorithms import ppo
from envs.my_env import PathPlanning, env_config
from ray.tune import register_env


def env_creator(config):
    return PathPlanning(config)


register_env("PathPlanning", env_creator)
ray.init()

ppo_config = ppo.PPOConfig()
ppo_config.environment(env="PathPlanning", env_config=env_config)
ppo_config.framework(framework="torch")
ppo_config.debugging(seed=415, log_level="ERROR")
ppo_config.evaluation(
    evaluation_interval=15,
    evaluation_duration=5,
    evaluation_num_workers=2,
    evaluation_parallel_to_training=True,
    evaluation_config=dict(
        explore=False,
        num_workers=2,
    ),
)
ppo_config.rollouts(num_rollout_workers=2,
                    num_envs_per_worker=1)
ppo_config.experimental(_disable_preprocessor_api=True)
ppo_algo = ppo_config.build()

ppo_config.training(lr=ray.tune.grid_search([5e-5, 2e-5]),
                    train_batch_size=ray.tune.grid_search([128, 256]))

stop = dict(
    timesteps_total=1,
    trainning_iteration=1
)
tuner = ray.tune.Tuner(
    ppo_config.algo_class,

    param_space=ppo_config.to_dict(),

    run_config=ray.air.RunConfig(
        local_dir="my_Tune_logs",
        stop=stop,
        verbose=3,
    )
)

experiment_results = tuner.fit()