TFActionDistribution Incompatible shapes

Hi! This is my first time using RLlib, so it's hard to navigate among all the abstractions.

My error:

File "/Users/etrapeznikov/Desktop/eugene/tech/buro/venv/lib/python3.10/site-packages/ray/rllib/policy/policy.py", line 1513, in _initialize_loss_from_dummy_batch
    self.loss(self.model, self.dist_class, train_batch)
  File "/Users/etrapeznikov/Desktop/eugene/tech/buro/venv/lib/python3.10/site-packages/ray/rllib/algorithms/ppo/ppo_tf_policy.py", line 142, in loss
    curr_action_dist.logp(train_batch[SampleBatch.ACTIONS])
  File "/Users/etrapeznikov/Desktop/eugene/tech/buro/venv/lib/python3.10/site-packages/ray/rllib/models/tf/tf_action_dist.py", line 357, in logp
    tf.math.square((tf.cast(x, tf.float32) - self.mean) / self.std), axis=1
  File "/Users/etrapeznikov/Desktop/eugene/tech/buro/venv/lib/python3.10/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/Users/etrapeznikov/Desktop/eugene/tech/buro/venv/lib/python3.10/site-packages/tensorflow/python/framework/ops.py", line 7262, in raise_from_not_ok_status
    raise core._status_to_exception(e) from None  # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: {{function_node __wrapped__Sub_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [32,2] vs. [32,32] [Op:Sub]

It looks like the problem is in the action space. [32, 2] seems reasonable: 32 is the batch size and 2 is the number of actions in my PPO env. But where did [32, 32] come from?
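
My (possibly wrong) reading of the shape arithmetic: with a 2-dimensional Box action space, PPO's default diagonal-Gaussian distribution splits the model output into a mean half and a log-std half, so the model head should emit 2 * 2 = 4 values per sample. A [32, 32] mean would only arise if the head emitted 64 values. A tiny sketch of what I think is happening (purely illustrative numbers, not my real model):

import numpy as np

batch, action_dim = 32, 2
expected_out = np.zeros((batch, 2 * action_dim))    # 4 outputs per sample
mean, log_std = np.split(expected_out, 2, axis=1)   # mean: (32, 2) -- matches the actions

suspect_out = np.zeros((batch, 64))                 # hypothetical 64-unit output head
bad_mean, _ = np.split(suspect_out, 2, axis=1)      # bad_mean: (32, 32) -- my error shape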

Here is all the relevant code:

Agent:

import ray
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.models import ModelCatalog

# MarketMaker (the env) and NZTModelV1 (the custom model) come from my own modules.


class MarketMakerCPPO(object):
    name = 'MarketMakerCPPO'

    def __init__(
            self,
            number_of_training_steps=1e5,
            load_weights=False,
            visualize=False,
            model_file: str = "",
            testing_files: list = [""],
            action_repeats: int = 5,
            model_report_path: str = "",
            **kwargs):

        # Init RLLib agent
        ModelCatalog.register_custom_model(
            NZTModelV1.name, NZTModelV1
        )

        ppo_config = (
            PPOConfig()
            .resources(num_gpus=0)
            .rollouts(num_rollout_workers=0)  # 0 - local
            .environment(MarketMaker, env_config=kwargs)
            .framework("tf2")
            .rl_module(
                _enable_rl_module_api=False,  # required to use custom model
            )
            .training(
                _enable_learner_api=False,  # required to use custom model
                train_batch_size=1024,  # try different values
                lr_schedule=[[0, 2e-5], [1e6, 5e-6]],  # [timestep, lr], this is from Nagy/Zohren paper
                model={
                    "custom_model": NZTModelV1.name,
                    "custom_model_config": {
                        "lob_model_path": model_file,
                    }
                },
            )
        )
        ray.init(ignore_reinit_error=True)

        self.agent = PPO(config=ppo_config)

    def start(self) -> None:
        """
        Entry point for agent training and testing
        """
        done = False

        while not done:
            result = self.agent.train()
            done = result["done"]
            self.agent.save('../models/ppo_model_agent_v1')
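
I haven't pasted NZTModelV1 in full. It is registered as a custom TFModelV2; a stripped-down skeleton of how I understand such a model is supposed to plug in (class name and layer sizes below are placeholders, not my real ones):

import tensorflow as tf
from ray.rllib.models.tf.tf_modelv2 import TFModelV2


class SketchModel(TFModelV2):
    """Placeholder skeleton, not the real NZTModelV1."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        # num_outputs is the width the action distribution expects
        # (for a Box(2,) space with a diagonal Gaussian that would be 4).
        self._flatten = tf.keras.layers.Flatten()
        self._policy_head = tf.keras.layers.Dense(num_outputs)
        self._value_head = tf.keras.layers.Dense(1)
        self._last_flat = None

    def forward(self, input_dict, state, seq_lens):
        # Flatten the (50, 68, 1) observation and produce num_outputs logits.
        self._last_flat = self._flatten(input_dict["obs"])
        return self._policy_head(self._last_flat), state

    def value_function(self):
        # Value branch reuses the last forward pass.
        return tf.reshape(self._value_head(self._last_flat), [-1])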

Env:

import numpy as np
import gymnasium as gym  # or "import gym", depending on the gym flavor in use

# BaseEnvironment is my own base class.


class MarketMaker(BaseEnvironment[float]):
    id = 'market-maker-v0'
    description = "Environment where limit orders are tethered to LOB price levels"

    def __init__(self, config):
        super().__init__(config)

        self.actions = np.zeros(2, dtype=np.float32)

        self.action_space = gym.spaces.Box(low=0., high=1.,
                                           shape=self.actions.shape,
                                           dtype=np.float32)
        self.observation_shape = (50, 68, 1)
        self.observation_space = gym.spaces.Box(low=-10., high=10.,
                                                shape=self.observation_shape,
                                                dtype=np.float32)
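
A quick standalone sanity check on the spaces themselves gives the shapes I expect:

import numpy as np
import gymnasium as gym  # same check works with classic gym

action_space = gym.spaces.Box(low=0., high=1., shape=(2,), dtype=np.float32)
observation_space = gym.spaces.Box(low=-10., high=10., shape=(50, 68, 1), dtype=np.float32)
print(action_space.sample().shape)       # (2,)
print(observation_space.sample().shape)  # (50, 68, 1)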

And one more question:
Would a single self.agent.train() call step the environment train_batch_size times?