Thanks a lot @mannyv for your elaborate answer. It took me some time to test out and debug example code that implements your suggestions. However, I always get a weird error that in debugging cannot be analyzed.
My environment looks now like this:
class MyEnv(gym.Env):
def __init__(self, config=None):
config = config or {}
self.price = config.get("price", 10)
self.mu = config.get("mu", 0.4)
self.sigma = config.get("sigma", 0.1)
self.timestep_limit = config.get("ts", 100)
observation_spaces = {
"price": Box(low=0.0, high=np.inf, shape=(1,), dtype=np.float64),
"position": Box(low=-self.timestep_limit, high=self.timestep_limit, shape=(1,), dtype=np.int32),
"entry": Box(low=0.0, high=np.inf, shape=(1,), dtype=np.float64)
}
self.observation_space = Dict(observation_spaces)
self.action_space = Discrete(3)
self.reset()
def reset(self):
self.price += np.random.normal(self.mu)*self.sigma
self.reward = 0.0
self.cumulated_reward = 0.0
self.position = 0
self.entry = 0.0
self.timesteps = 0
return self._get_obs()
def _get_obs(self):
return {
"price": np.array([self.price], dtype=np.float64),
"position": np.array([self.position], dtype=np.int32),
"entry": np.array([self.entry], dtype=np.float64)
}
def step(self, action):
action, _, _ = action
action = -1 if action == 2 else action
self.timesteps += 1
is_done = self.timesteps >= self.timestep_limit
self.position += action
self.entry = np.absolute(action)*self.price
self.price += np.random.normal(self.mu)*self.sigma
self.reward = (self.price-self.entry)*self.position
self.cumulated_reward += self.reward
obs = self._get_obs()
return obs, self.reward, is_done, {}
def render(self, mode=None):
print("Iteration: {}".format(self.timesteps))
print("Cumulated reward: {}".format(self.cumulated_reward))
print('Position: {}'.format(self.position))
print()
and my policy as follows:
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ModelWeights
class DummyTrainer(Policy):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.multiplicator = self.config.get("multiplicator", 1)
def compute_actions(self,
obs=None,
state_batches=None,
prev_action_batch=None,
prev_reward_batch=None,
**kwargs):
batch_size = obs['price'].shape[0]
return np.random.choice(3, size=[batch_size,1]), [], {}
def learn_on_batch(self, samples):
return
@override(Policy)
def get_weights(self) -> ModelWeights:
"""No weights to save."""
return {}
@override(Policy)
def set_weights(self, weights: ModelWeights) -> None:
"""No weights to set."""
pass
When I build my trainer and call train()
on it as follows
from ray.rllib.agents.trainer_template import build_trainer
ray.init(ignore_reinit_error=True)
from ray.rllib.agents.ppo import PPOTrainer
MyTrainer = build_trainer(
name="DummyTrainer",
default_policy=DummyTrainer
)
config = {
"env": MyEnv,
"env_config": {
"config": {
"ts": 50,
"mu": 20,
"sigma": 0.05,
},
},
"num_workers": 1,
"log_level": "DEBUG",
"create_env_on_driver": True,
}
my_trainer = MyTrainer(config=config)
I get the following output from it:
2021-07-03 14:07:49,084 INFO services.py:1267 -- View the Ray dashboard at http://127.0.0.1:8266
2021-07-03 14:07:50,569 INFO trainer.py:669 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
(pid=126088) WARNING:tensorflow:From /home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
(pid=126088) Instructions for updating:
(pid=126088) non-resource variables are not supported in the long term
(pid=126088) WARNING:tensorflow:From /home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
(pid=126088) Instructions for updating:
(pid=126088) non-resource variables are not supported in the long term
2021-07-03 14:07:52,595 DEBUG rollout_worker.py:1122 -- Creating policy for default_policy
2021-07-03 14:07:52,596 DEBUG preprocessors.py:249 -- Creating sub-preprocessor for Box(0.0, inf, (1,), float64)
2021-07-03 14:07:52,596 DEBUG preprocessors.py:249 -- Creating sub-preprocessor for Box(-100, 100, (1,), int32)
2021-07-03 14:07:52,597 DEBUG preprocessors.py:249 -- Creating sub-preprocessor for Box(0.0, inf, (1,), float64)
2021-07-03 14:07:52,597 DEBUG catalog.py:631 -- Created preprocessor <ray.rllib.models.preprocessors.DictFlatteningPreprocessor object at 0x7fd85c00d370>: Dict(entry:Box(0.0, inf, (1,), float64), position:Box(-100, 100, (1,), int32), price:Box(0.0, inf, (1,), float64)) -> (3,)
2021-07-03 14:07:52,598 INFO rollout_worker.py:1161 -- Built policy map: {'default_policy': <__main__.DummyTrainer object at 0x7fd85c00d850>}
2021-07-03 14:07:52,599 INFO rollout_worker.py:1162 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.DictFlatteningPreprocessor object at 0x7fd85c00d370>}
2021-07-03 14:07:52,600 DEBUG rollout_worker.py:531 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
2021-07-03 14:07:52,602 INFO rollout_worker.py:563 -- Built filter map: {'default_policy': <ray.rllib.utils.filter.NoFilter object at 0x7fd85c00d340>}
2021-07-03 14:07:52,603 DEBUG rollout_worker.py:678 -- Created rollout worker with env <ray.rllib.env.base_env._VectorEnvToBaseEnv object at 0x7fd85c00dd00> (<MyEnv instance>), policies {'default_policy': <__main__.DummyTrainer object at 0x7fd85c00d850>}
2021-07-03 14:07:52,608 WARNING util.py:53 -- Install gputil for GPU system monitoring.
<ray.rllib.agents.trainer_template.DummyTrainer at 0x7fd85c086100>
and from my_trainer.train()
:
2021-07-03 14:07:58,877 INFO trainer.py:569 -- Worker crashed during call to train(). To attempt to continue training without the failed worker, set `'ignore_worker_failures': True`.
---------------------------------------------------------------------------
RayTaskError(IndexError) Traceback (most recent call last)
<ipython-input-10-5523c1af32b5> in <module>
----> 1 results = my_trainer.train()
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/agents/trainer.py in train(self)
571 "continue training without the failed worker, set "
572 "`'ignore_worker_failures': True`.")
--> 573 raise e
574 except Exception as e:
575 time.sleep(0.5) # allow logs messages to propagate
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/agents/trainer.py in train(self)
560 for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
561 try:
--> 562 result = Trainable.train(self)
563 except RayError as e:
564 if self.config["ignore_worker_failures"]:
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/tune/trainable.py in train(self)
230 """
231 start = time.time()
--> 232 result = self.step()
233 assert isinstance(result, dict), "step() needs to return a dict."
234
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/agents/trainer_template.py in step(self)
160 @override(Trainer)
161 def step(self):
--> 162 res = next(self.train_exec_impl)
163
164 # self._iteration gets incremented after this function returns,
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in __next__(self)
754 def __next__(self):
755 self._build_once()
--> 756 return next(self.built_iterator)
757
758 def __str__(self):
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_foreach(it)
781
782 def apply_foreach(it):
--> 783 for item in it:
784 if isinstance(item, _NextValueNotReady):
785 yield item
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_filter(it)
841 def filter(self, fn: Callable[[T], bool]) -> "LocalIterator[T]":
842 def apply_filter(it):
--> 843 for item in it:
844 with self._metrics_context():
845 if isinstance(item, _NextValueNotReady) or fn(item):
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_filter(it)
841 def filter(self, fn: Callable[[T], bool]) -> "LocalIterator[T]":
842 def apply_filter(it):
--> 843 for item in it:
844 with self._metrics_context():
845 if isinstance(item, _NextValueNotReady) or fn(item):
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_foreach(it)
781
782 def apply_foreach(it):
--> 783 for item in it:
784 if isinstance(item, _NextValueNotReady):
785 yield item
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_flatten(it)
874 def flatten(self) -> "LocalIterator[T[0]]":
875 def apply_flatten(it):
--> 876 for item in it:
877 if isinstance(item, _NextValueNotReady):
878 yield item
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in add_wait_hooks(it)
826 fn._on_fetch_start()
827 new_item = False
--> 828 item = next(it)
829 if not isinstance(item, _NextValueNotReady):
830 new_item = True
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_foreach(it)
781
782 def apply_foreach(it):
--> 783 for item in it:
784 if isinstance(item, _NextValueNotReady):
785 yield item
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_foreach(it)
781
782 def apply_foreach(it):
--> 783 for item in it:
784 if isinstance(item, _NextValueNotReady):
785 yield item
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in apply_foreach(it)
781
782 def apply_foreach(it):
--> 783 for item in it:
784 if isinstance(item, _NextValueNotReady):
785 yield item
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py in base_iterator(timeout)
469 while active:
470 try:
--> 471 yield ray.get(futures, timeout=timeout)
472 futures = [a.par_iter_next.remote() for a in active]
473 # Always yield after each round of gets with timeout.
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
45 if client_mode_should_convert():
46 return getattr(ray, func.__name__)(*args, **kwargs)
---> 47 return func(*args, **kwargs)
48
49 return wrapper
~/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/worker.py in get(object_refs, timeout)
1479 worker.core_worker.dump_object_store_memory_usage()
1480 if isinstance(value, RayTaskError):
-> 1481 raise value.as_instanceof_cause()
1482 else:
1483 raise value
RayTaskError(IndexError): ray::RolloutWorker.par_iter_next() (pid=126088, ip=192.168.1.111)
File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/util/iter.py", line 1152, in par_iter_next
return next(self.local_it)
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 332, in gen_rollouts
yield self.sample()
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 706, in sample
batches = [self.input_reader.next()]
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 96, in next
batches = [self.get_data()]
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 223, in get_data
item = next(self.rollout_provider)
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 622, in _env_runner
eval_results = _do_policy_eval(
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 1003, in _do_policy_eval
policy.compute_actions_from_input_dict(
File "/home/simon/git-projects/forex-strategy-learning/.venv/lib/python3.8/site-packages/ray/rllib/policy/policy.py", line 280, in compute_actions_from_input_dict
return self.compute_actions(
File "<ipython-input-3-32722a609754>", line 17, in compute_actions
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
Debugging my code I get always when compute_actions
is called the obs
object and can also extract obs['price']
with shape[0]
equal to one. Therefore I do not understand the error, which somehow points towards a non-integer index in numpy. Does anyone has an idea? @mannyv, @kai, or @sven1977 maybe?
Thanks in advance to everyone who reads and thinks 