How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
Hi,
I am running into a weird issue (full log below): I am providing a correctly shaped observation, yet I still get an error. However, the client and server keep communicating, and the training loop completes successfully (not that there is much progress yet, as I am still testing things out). The error also only shows up once, at the top of the log.
Essentially, I just want to double-check whether this error is legitimate or a false positive.
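For context, here is a small standalone check (just a sketch, not part of my actual setup) that compares a dummy frame against the same Box space I declare in the server script below; the random float64 array only stands in for the raw observation the log reports:

import numpy as np
from gymnasium.spaces import Box

# Same space as declared on the policy server below.
obs_space = Box(low=0, high=1, shape=(240, 320, 1), dtype=np.float32)

# Dummy observation shaped like the one in the warning; np.random.rand
# returns float64, which is also the dtype the log reports for the raw obs.
obs_f64 = np.random.rand(240, 320, 1)
obs_f32 = obs_f64.astype(np.float32)

print(obs_f64.shape == obs_space.shape)   # True: the shapes line up
print(obs_space.contains(obs_f64))        # may be False on newer gym/gymnasium,
                                          # which also compare dtypes via np.can_cast
print(obs_space.contains(obs_f32))        # True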
Policy server:
from gym import spaces
import ray
from ray.rllib.agents import with_common_config
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.env import PolicyServerInput
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.examples.env.random_env import RandomEnv
import numpy as np
import argparse
from gymnasium.spaces import MultiDiscrete, Box
ppo_config = PPOConfig()
parser = argparse.ArgumentParser(description='Optional app description')
parser.add_argument('-ip', type=str, help='IP of this device')
parser.add_argument('-checkpoint', type=str, help='location of checkpoint to restore from')
args = parser.parse_args()
def _input(ioctx):
    return PolicyServerInput(
        ioctx,
        args.ip,
        55556,
    )
x = 320
y = 240
# coef = 0.5
# x = int(x * coef)
# y = int(y * coef)
# ignored:
# kl_coeff, ->
# vf_loss_coeff used to be 0.01??
# "entropy_coeff": 0.00005,
# "clip_param": 0.1,
ppo_config.gamma = 0.998 # default 0.99
ppo_config.lambda_ = 0.99 # default 1.0???
ppo_config.kl_target = 0.01 # used to use 0.02
ppo_config.rollout_fragment_length = 16
ppo_config.train_batch_size = 2560
ppo_config.sgd_minibatch_size = 128
ppo_config.num_sgd_iter = 1 # default 30???
ppo_config.lr = 3.5e-5 # 5e-5
ppo_config.model = {
    # Share layers for value function. If you set this to True, it's
    # important to tune vf_loss_coeff.
    "vf_share_layers": False,
    "use_lstm": True,
    "max_seq_len": 32,
    "lstm_cell_size": 128,
    "lstm_use_prev_action": True,
    # 'use_attention': True,
    # "max_seq_len": 128,
    # "attention_num_transformer_units": 1,
    # "attention_dim": 1024,
    # "attention_memory_inference": 128,
    # "attention_memory_training": 128,
    # "attention_num_heads": 8,
    # "attention_head_dim": 64,
    # "attention_position_wise_mlp_dim": 512,
    # "attention_use_n_prev_actions": 0,
    # "attention_use_n_prev_rewards": 0,
    # "attention_init_gru_gate_bias": 2.0,
    "conv_filters": [
        # [4, [3, 4], [1, 1]],
        # [16, [6, 8], [3, 3]],
        # [32, [6, 8], [3, 4]],
        # [64, [6, 6], 3],
        # [256, [9, 9], 1],
        # 480 x 640
        # [4, [7, 7], [3, 3]],
        # [16, [5, 5], [3, 3]],
        # [32, [5, 5], [2, 2]],
        # [64, [5, 5], [2, 2]],
        # [256, [5, 5], [3, 5]],
        # 240 x 320
        [16, [5, 5], 3],
        [32, [5, 5], 3],
        [64, [5, 5], 3],
        [128, [3, 3], 2],
        [256, [3, 3], 2],
        [512, [3, 3], 2],
    ],
    "conv_activation": "relu",
    "post_fcnet_hiddens": [512],
    "post_fcnet_activation": "relu",
}
ppo_config.batch_mode = "complete_episodes"
ppo_config.simple_optimizer = True
ppo_config.num_gpus = 0
# ppo_config.input_ = (
# lambda ioctx: PolicyServerInput(ioctx, args.ip, 55556)
# )
ppo_config.rollouts(num_rollout_workers=0)
ppo_config.offline_data(input_=_input)
ppo_config.env = None
ppo_config.observation_space = Box(low=0, high=1, shape=(y, x, 1), dtype=np.float32)
ppo_config.action_space = MultiDiscrete(
    [
        2,  # W
        2,  # A
        2,  # S
        2,  # D
        2,  # Space
        2,  # H
        2,  # J
        2,  # K
        2,  # L
    ]
)
ppo_config.env_config = {
    "sleep": True,
}
ppo_config.framework_str = 'tf'
ppo_config.log_sys_usage = False
ppo_config.compress_observations = True
ppo_config.shuffle_sequences = False
print(ppo_config.to_dict())
tempyy = ppo_config.to_dict()
ray.init(num_cpus=2, num_gpus=0, log_to_driver=False)
trainer = PPOTrainer
from ray import tune
name = "" + args.checkpoint
print(f"Starting: {name}")
tune.run(trainer,
         resume='AUTO',
         config=ppo_config.to_dict(),
         name=name,
         keep_checkpoints_num=None,
         checkpoint_score_attr="episode_reward_mean",
         max_failures=1,
         # restore="C:\\Users\\denys\\ray_results\\mediumbrawl-attention-256Att-128MLP-L2\\PPOTrainer_RandomEnv_1e882_00000_0_2022-06-02_15-13-44\\checkpoint_000028\\checkpoint-28",
         checkpoint_freq=5,
         checkpoint_at_end=True)
Policy client:
import os
import cv2
from ray.rllib.env import PolicyClient
from pathlib import Path
from environment import BrawlEnv
import logging
import time
import argparse
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description='Optional app description')
parser.add_argument('-ip', type=str,
                    help='IP of this device')
parser.add_argument('-speed', type=float,
                    help='gameFactor, default 1.0')
parser.add_argument('-update', type=float,
                    help='seconds how often to update from main process')
parser.add_argument('-local', type=str,
                    help='Whether to create and update a local copy of the AI (adds delay) or query server for each action. '
                         'possible values: "local" or "remote"')
args = parser.parse_args()
update = 3600.0
local = 'local'
remoteee = False
if args.update:
update = args.update
# remoteee = True
if args.local:
local = args.local
if local == 'remote':
remoteee = True
print(f"Going to update {local}-y at {update} seconds interval")
print('trying to launch policy client')
print(f"http://{args.ip}:55556")
# Setting update_interval to false, so it doesn't update in middle of games, will be manually updating it between games
client = PolicyClient(address=f"http://{args.ip}:55556", update_interval=False, inference_mode=local)
# client = PolicyClient(address=f"http://{args.ip}:55556", update_interval=60, inference_mode=local)
forced = True
root = None
env = BrawlEnv({'sleep': True})
print('trying to get initial eid')
episode_id = client.start_episode()
# if local == 'remote':
# env.underlord.startNewGame()c
# gameObservation = env.underlord.getObservation()
reward = 0
print('starting main loop')
replayList = []
update = True
runningReward = 0
counter = 0
runningCounter = 0
numLoops = 0
startTime = time.time()
endTime = time.time()
fps = 5
actionTimeOut = 1.0 / fps
print(f"action time: {actionTimeOut}")
actionTime = time.time()
env.restartRound()
x = 320
y = 240
epochActions = 4096
actionsUntilEpoch = 4096
epochNum = 0
needReset = False
numActions = 0
old_id = None
gameTime = time.time()
while True:
    # if needReset:
    #     env.releaseAllKeys()
    if numActions % 500 == 0:
        env.refreshWindow()
    elapsed_time = time.time() - actionTime
    if elapsed_time < actionTimeOut:
        time.sleep(actionTimeOut - elapsed_time)
        # continue
    actionTime = time.time()
    # average out to ~30 actions a second
    counter = counter + 1
    runningCounter = runningCounter + 1
    endTime = time.time()
    if (endTime - startTime) > 1:
        print(f"actions per second: {counter}")
        startTime = time.time()
        counter = 0
        numLoops = numLoops + 1
    # timeStart = time.time()
    gameObservation, reward, gameOver = env.getObservation()
    # print(f"Time to get obs: {time.time() - timeStart}")
    # print('got observation')
    # print(gameObservation)
    # print(env.observation_space.contains(gameObservation))
    # print(reward, gameOver)
    # if not env.observation_space.contains(gameObservation):
    #     print(gameObservation)
    #     print("Not lined up 1")
    #     print(env.underlord.heroAlliances)
    #     sys.exit()
    action = None
    # timeStart = time.time()
    action = client.get_action(episode_id=episode_id, observation=gameObservation)
    # print(f"Time to get action: {time.time() - timeStart}")
    if needReset:
        print('starting reset!')
        if local == 'local':
            print("updating weights")
            client.update_policy_weights()
            print('finished updating weights')
            time.sleep(0.25)
        env.refreshWindow()
        time.sleep(0.25)
        # env.releaseAllKeys()
        env.restartRound()
        needReset = False
        reward = 0
        numLoops = 0
        runningCounter = 0
        counter = 0
        gameOver = False
        print('resetFinished!')
    else:
        # timeStart = time.time()
        env.act(action)
        # print(f"Time to act: {time.time() - timeStart}")
        # print('took action')
    # print('got action')
    runningReward += reward
    # act_time = time.time() - act_time
    # print("--- %s seconds to get do action ---" % (time.time() - start_time))
    # print(f"running reward: {reward}")
    client.log_returns(episode_id=episode_id, reward=reward)
    # print('logged returns')
    # Updating the model after every game in case there is a new one
    numActions = numActions + 1
    if gameOver and numActions > 25:
        # if elapsed_time > 20:
        #     print("restarting due to elapsed time")
        env.releaseAllKeys()
        env.resetHP()
        numActions = 0
        if reward <= -1:
            print(f"GAME OVER! WE Lost final reward: {runningReward}! Number of actions: {runningCounter}")
            env.gameLog += f"GAME OVER! WE Lost final reward: {runningReward}! Number of actions: {runningCounter}\n"
        else:
            print(f"GAME OVER! WE Won final reward: {runningReward}! Number of actions: {runningCounter}")
            env.gameLog += f"GAME OVER! WE Won final reward: {runningReward}! Number of actions: {runningCounter}\n"
        env.gameLog += str(env.rewards)
        if runningReward >= -0.6:
            folderString = f"reward-{round(runningReward, 4)}-{epochNum}-{runningCounter}"
            fullString = os.getcwd() + "/replays/" + folderString
            if reward >= 0.0:
                fullString = os.getcwd() + "/replays/positive/" + folderString
            elif reward >= -0.3:
                fullString = os.getcwd() + "/replays/good/" + folderString
            else:
                fullString = os.getcwd() + "/replays/meh/" + folderString
            Path(fullString).mkdir(parents=True, exist_ok=True)
            f = open(fullString + "/log.txt", "a")
            f.write(env.gameLog)
            # this would be a 10 minute long game
            video_fps = ((runningCounter - counter) / numLoops) + (counter / fps)
            if len(env.images) <= 6000:
                fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
                video = cv2.VideoWriter(fullString + '/video.avi', fourcc, video_fps, (x, y), False)
                for img in env.images:
                    # img = img * 255.0
                    video.write(img.astype('uint8'))
                video.release()
        env.images = []
        env.gameLog = ""
        actionsUntilEpoch = actionsUntilEpoch - runningCounter
        if actionsUntilEpoch < 0:
            epochNum = epochNum + 1
        print(f"Actions until epoch: {actionsUntilEpoch}, current epoch: {epochNum}")
        print(env.rewards)
        if actionsUntilEpoch < 0:
            actionsUntilEpoch = epochActions
        runningReward = 0
        runningCounter = 0
        reward = 0
        numLoops = 0
        # need to call a reset of env here
        finalObs, reward, gameOver = env.getObservation()
        old_id = episode_id
        client.end_episode(episode_id=episode_id, observation=finalObs)
        episode_id = client.start_episode(episode_id=None)
        needReset = True
        time.sleep(0.25)
    # print('finished logging step')
    # print("--- %s seconds to get finish logging return ---" % (time.time() - start_time))
    # replayList.append((gameObservation, action, reward))
    # print(f"Round: {gameObservation[5]} - Time Left: {gameObservation[12]} - Obs duration: {obs_time} - Act
    # duration: {act_time} - Overall duration: {time.time() - start_time}")
Error log:
INFO:ray.rllib.evaluation.sampler:Raw obs from env: { 'c14d2a6b5fd645dbb34e18f7278d1f4d': { 'agent0': np.ndarray((240, 320, 1), dtype=float64, min=0.0, max=0.996, mean=0.666)}}
INFO:ray.rllib.evaluation.sampler:Info return from env: {'c14d2a6b5fd645dbb34e18f7278d1f4d': {'agent0': {}}}
INFO:ray.rllib.evaluation.sampler:Preprocessed obs: np.ndarray((240, 320, 1), dtype=float64, min=0.0, max=0.996, mean=0.666)
INFO:ray.rllib.evaluation.sampler:Filtered obs: np.ndarray((240, 320, 1), dtype=float64, min=0.0, max=0.996, mean=0.666)
WARNING:ray.rllib.evaluation.collectors.agent_collector:Provided tensor
[[[0.23529412]
[0.23137255]
[0.22745098]
...
[0.21960784]
[0.22745098]
[0.23137255]]
[[0.23529412]
[0.23137255]
[0.22352941]
...
[0.21568627]
[0.22352941]
[0.22745098]]
[[0.23137255]
[0.23137255]
[0.21960784]
...
[0.21176471]
[0.21960784]
[0.22352941]]
...
[[0.23529412]
[0.23137255]
[0.22745098]
...
[0.14509804]
[0.14901961]
[0.15294118]]
[[0.23529412]
[0.23137255]
[0.22745098]
...
[0.14901961]
[0.15294118]
[0.15686275]]
[[0.23529412]
[0.23529412]
[0.23137255]
...
[0.15294118]
[0.15686275]
[0.15686275]]]
does not match space of view requirements obs.
Provided tensor has shape (240, 320, 1) and view requirement has shape shape (240, 320, 1).Make sure dimensions match to resolve this warning.