Help needed with a simple demo

1. Severity of the issue: (select one)
High: Completely blocks me.

2. Environment:

  • Ray version: 2.49.1
  • Python version: 3.9
  • OS: Linux iZuf69e0i0z4iawhy8o915Z 6.8.0-60-generic #63-Ubuntu SMP PREEMPT_DYNAMIC Tue Apr 15 19:04:15 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux
  • Cloud/Infrastructure:
  • Other libs/tools (if relevant):

3. What happened vs. what you expected:

  • Expected: The training run starts and completes normally.
  • Actual: The trainer fails during setup, before any training step runs, with a KeyError raised while validating the ScalingConfig ('use_tpu' on one attempt, 'topology' on another). Full logs are below.

My script:

import ray.train.torch
import os
import tempfile

import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision.models import resnet18
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor, Normalize, Compose

import s3fs
import pyarrow.fs

os.environ["RAY_record_ref_creation_sites"] = "1"
os.environ["RAY_TRAIN_V2_ENABLED"] = "1"

s3_fs = s3fs.S3FileSystem(
    key='minioadmin',
    secret='minioadmin',
    endpoint_url='http://10.40.126.149:9000'
)
custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))


def train_func():
    # Model, Loss, Optimizer
    model = resnet18(num_classes=10)
    model.conv1 = torch.nn.Conv2d(
        1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
    )
    # [1] Prepare model.
    model = ray.train.torch.prepare_model(model)
    # model.to("cuda")  # This is done by `prepare_model`
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    # Data
    transform = Compose([ToTensor(), Normalize((0.28604,), (0.32025,))])
    data_dir = os.path.join(tempfile.gettempdir(), "data")
    train_data = FashionMNIST(
        root=data_dir, train=True, download=True, transform=transform)
    train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
    # [2] Prepare dataloader.
    train_loader = ray.train.torch.prepare_data_loader(train_loader)

    # Training
    for epoch in range(10):
        if ray.train.get_context().get_world_size() > 1:
            train_loader.sampler.set_epoch(epoch)

        for images, labels in train_loader:
            # This is done by `prepare_data_loader`!
            # images, labels = images.to("cuda"), labels.to("cuda")
            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # [3] Report metrics and checkpoint.
        metrics = {"loss": loss.item(), "epoch": epoch}
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            torch.save(
                model.state_dict(),
                os.path.join(temp_checkpoint_dir, "model.pt")
            )
            # if ray.train.get_context().get_world_rank() == 0:
            ray.train.report(
                metrics,
                checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir),
            )
            # else:
            #     ray.train.report(metrics)
            print(metrics)


# [4] Configure scaling and resource requirements.
scaling_config = ray.train.ScalingConfig(num_workers=3, use_gpu=False)
run_config = ray.train.RunConfig(
    # storage_path="xtrain", storage_filesystem=custom_fs
)

# [5] Launch distributed training job.
trainer = ray.train.torch.TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=run_config,
)
result = trainer.fit()

# [6] Load the trained model.
with result.checkpoint.as_directory() as checkpoint_dir:
    model_state_dict = torch.load(os.path.join(checkpoint_dir, "model.pt"))
    model = resnet18(num_classes=10)
    model.conv1 = torch.nn.Conv2d(
        1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
    )
    model.load_state_dict(model_state_dict)

error msg:

python test.py
2025-09-16 18:31:56,374 INFO worker.py:1771 -- Connecting to existing Ray cluster at address: 10.40.126.149:6379...
2025-09-16 18:31:56,382 INFO worker.py:1942 -- Connected to Ray cluster. View the dashboard at 10.40.126.149:8265
2025-09-16 18:31:56,414 INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call ray.init(...) before <FrameworkTrainer>(...).

View detailed results here: xtrain/TorchTrainer_2025-09-16_18-31-56
To visualize your results with TensorBoard, run: tensorboard --logdir /tmp/ray/session_2025-09-16_18-07-54_993406_6329/artifacts/2025-09-16_18-31-56/TorchTrainer_2025-09-16_18-31-56/driver_artifacts
2025-09-16 18:31:56,433 INFO data_parallel_trainer.py:339 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set use_gpu to True in your scaling config.
2025-09-16 18:31:59,080 ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_5e83b_00000
Traceback (most recent call last):
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
result = ray.get(future)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/worker.py", line 2882, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/worker.py", line 970, in get_objects
raise value
ray.exceptions.ActorDiedError: The actor died because of an error raised in its creation task, ray::_Inner.__init__() (pid=8557, ip=10.40.126.148, actor_id=d3b3c0e44c6e9ca24416166d08000000, repr=TorchTrainer)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 158, in __init__
self.setup(copy.deepcopy(self.config))
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/util.py", line 120, in setup
super(_Inner, self).setup(config, **setup_kwargs)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 845, in setup
] = self._reconcile_scaling_config_with_trial_resources(
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 876, in _reconcile_scaling_config_with_trial_resources
trainer_cls._validate_scaling_config(scaling_config)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 332, in _validate_scaling_config
scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config(
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 595, in _validate_scaling_config
ensure_only_allowed_dataclass_keys_updated(
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 38, in ensure_only_allowed_dataclass_keys_updated
bad_keys = [
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 41, in <listcomp>
if dataclass.__dict__[key] != default_data.__dict__[key]
KeyError: 'use_tpu'

Training errored after 0 iterations at 2025-09-16 18:31:59. Total running time: 2s
Error file: /tmp/ray/session_2025-09-16_18-07-54_993406_6329/artifacts/2025-09-16_18-31-56/TorchTrainer_2025-09-16_18-31-56/driver_artifacts/TorchTrainer_5e83b_00000_0_2025-09-16_18-31-56/error.txt
2025-09-16 18:31:59,098 INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'xtrain/TorchTrainer_2025-09-16_18-31-56' in 0.0163s.

2025-09-16 18:31:59,098 ERROR tune.py:1037 -- Trials did not complete: [TorchTrainer_5e83b_00000]
2025-09-16 18:31:59,115 WARNING experiment_analysis.py:180 -- Failed to fetch metrics for 1 trial(s):

  • TorchTrainer_5e83b_00000: FileNotFoundError('Could not fetch metrics for TorchTrainer_5e83b_00000: both result.json and progress.csv were not found at xtrain/TorchTrainer_2025-09-16_18-31-56/TorchTrainer_5e83b_00000_0_2025-09-16_18-31-56')
    ray.exceptions.ActorDiedError: The actor died because of an error raised in its creation task, ray::_Inner.__init__() (pid=8557, ip=10.40.126.148, actor_id=d3b3c0e44c6e9ca24416166d08000000, repr=TorchTrainer)
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 158, in __init__
    self.setup(copy.deepcopy(self.config))
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/util.py", line 120, in setup
    super(_Inner, self).setup(config, **setup_kwargs)
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 845, in setup
    ] = self._reconcile_scaling_config_with_trial_resources(
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 876, in _reconcile_scaling_config_with_trial_resources
    trainer_cls._validate_scaling_config(scaling_config)
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 332, in _validate_scaling_config
    scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config(
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 595, in _validate_scaling_config
    ensure_only_allowed_dataclass_keys_updated(
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 38, in ensure_only_allowed_dataclass_keys_updated
    bad_keys = [
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 41, in <listcomp>
    if dataclass.__dict__[key] != default_data.__dict__[key]
    KeyError: 'use_tpu'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/ecs-user/xtrain/test/test.py", line 92, in <module>
result = trainer.fit()
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 722, in fit
raise TrainingFailedError(
ray.train.base_trainer.TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: trainer = TorchTrainer.restore("xtrain/TorchTrainer_2025-09-16_18-31-56").
To start a new run that will retry on training failures, set train.RunConfig(failure_config=train.FailureConfig(max_failures)) in the Trainer's run_config with max_failures > 0, or max_failures = -1 for unlimited retries.
(TrainTrainable pid=8557, ip=10.40.126.148) Exception raised in creation task: The actor died because of an error raised in its creation task, ray::_Inner.__init__() (pid=8557, ip=10.40.126.148, actor_id=d3b3c0e44c6e9ca24416166d08000000, repr=TorchTrainer)
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 158, in __init__
(TrainTrainable pid=8557, ip=10.40.126.148) self.setup(copy.deepcopy(self.config))
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/util.py", line 120, in setup
(TrainTrainable pid=8557, ip=10.40.126.148) super(_Inner, self).setup(config, **setup_kwargs)
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 845, in setup
(TrainTrainable pid=8557, ip=10.40.126.148) ] = self._reconcile_scaling_config_with_trial_resources(
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 876, in _reconcile_scaling_config_with_trial_resources
(TrainTrainable pid=8557, ip=10.40.126.148) trainer_cls._validate_scaling_config(scaling_config)
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 332, in _validate_scaling_config
(TrainTrainable pid=8557, ip=10.40.126.148) scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config(
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 595, in _validate_scaling_config
(TrainTrainable pid=8557, ip=10.40.126.148) ensure_only_allowed_dataclass_keys_updated(
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 38, in ensure_only_allowed_dataclass_keys_updated
(TrainTrainable pid=8557, ip=10.40.126.148) bad_keys = [
(TrainTrainable pid=8557, ip=10.40.126.148) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 41, in <listcomp>
(TrainTrainable pid=8557, ip=10.40.126.148) if dataclass.__dict__[key] != default_data.__dict__[key]
(TrainTrainable pid=8557, ip=10.40.126.148) KeyError: 'use_tpu'
(ray) ➜ test python test.py
2025-09-16 18:32:12,088 INFO worker.py:1771 -- Connecting to existing Ray cluster at address: 10.40.126.149:6379...
2025-09-16 18:32:12,096 INFO worker.py:1942 -- Connected to Ray cluster. View the dashboard at 10.40.126.149:8265
2025-09-16 18:32:12,128 INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call ray.init(...) before <FrameworkTrainer>(...).

View detailed results here: xtrain/TorchTrainer_2025-09-16_18-32-11
To visualize your results with TensorBoard, run: tensorboard --logdir /tmp/ray/session_2025-09-16_18-07-54_993406_6329/artifacts/2025-09-16_18-32-12/TorchTrainer_2025-09-16_18-32-11/driver_artifacts
2025-09-16 18:32:12,147 INFO data_parallel_trainer.py:339 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set use_gpu to True in your scaling config.
2025-09-16 18:32:14,536 ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_67e18_00000
Traceback (most recent call last):
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
result = ray.get(future)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/worker.py", line 2882, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/_private/worker.py", line 970, in get_objects
raise value
ray.exceptions.ActorDiedError: The actor died because of an error raised in its creation task, ray::_Inner.__init__() (pid=10580, ip=10.40.126.149, actor_id=c367ee4ed7085284d2a1418409000000, repr=TorchTrainer)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 158, in __init__
self.setup(copy.deepcopy(self.config))
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/util.py", line 120, in setup
super(_Inner, self).setup(config, **setup_kwargs)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 845, in setup
] = self._reconcile_scaling_config_with_trial_resources(
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 876, in _reconcile_scaling_config_with_trial_resources
trainer_cls._validate_scaling_config(scaling_config)
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 332, in _validate_scaling_config
scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config(
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 595, in _validate_scaling_config
ensure_only_allowed_dataclass_keys_updated(
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 38, in ensure_only_allowed_dataclass_keys_updated
bad_keys = [
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 41, in <listcomp>
if dataclass.__dict__[key] != default_data.__dict__[key]
KeyError: 'topology'

Training errored after 0 iterations at 2025-09-16 18:32:14. Total running time: 2s
Error file: /tmp/ray/session_2025-09-16_18-07-54_993406_6329/artifacts/2025-09-16_18-32-12/TorchTrainer_2025-09-16_18-32-11/driver_artifacts/TorchTrainer_67e18_00000_0_2025-09-16_18-32-12/error.txt
2025-09-16 18:32:14,555 INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'xtrain/TorchTrainer_2025-09-16_18-32-11' in 0.0177s.

2025-09-16 18:32:14,555 ERROR tune.py:1037 -- Trials did not complete: [TorchTrainer_67e18_00000]
2025-09-16 18:32:14,571 WARNING experiment_analysis.py:180 -- Failed to fetch metrics for 1 trial(s):

  • TorchTrainer_67e18_00000: FileNotFoundError('Could not fetch metrics for TorchTrainer_67e18_00000: both result.json and progress.csv were not found at xtrain/TorchTrainer_2025-09-16_18-32-11/TorchTrainer_67e18_00000_0_2025-09-16_18-32-12')
    ray.exceptions.ActorDiedError: The actor died because of an error raised in its creation task, ray::_Inner.__init__() (pid=10580, ip=10.40.126.149, actor_id=c367ee4ed7085284d2a1418409000000, repr=TorchTrainer)
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 158, in __init__
    self.setup(copy.deepcopy(self.config))
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/util.py", line 120, in setup
    super(_Inner, self).setup(config, **setup_kwargs)
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 845, in setup
    ] = self._reconcile_scaling_config_with_trial_resources(
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 876, in _reconcile_scaling_config_with_trial_resources
    trainer_cls._validate_scaling_config(scaling_config)
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 332, in _validate_scaling_config
    scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config(
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 595, in _validate_scaling_config
    ensure_only_allowed_dataclass_keys_updated(
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 38, in ensure_only_allowed_dataclass_keys_updated
    bad_keys = [
    File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 41, in <listcomp>
    if dataclass.__dict__[key] != default_data.__dict__[key]
    KeyError: 'topology'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/ecs-user/xtrain/test/test.py", line 92, in <module>
result = trainer.fit()
File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 722, in fit
raise TrainingFailedError(
ray.train.base_trainer.TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: trainer = TorchTrainer.restore("xtrain/TorchTrainer_2025-09-16_18-32-11").
To start a new run that will retry on training failures, set train.RunConfig(failure_config=train.FailureConfig(max_failures)) in the Trainer's run_config with max_failures > 0, or max_failures = -1 for unlimited retries.
(TrainTrainable pid=10580) Exception raised in creation task: The actor died because of an error raised in its creation task, ray::_Inner.__init__() (pid=10580, ip=10.40.126.149, actor_id=c367ee4ed7085284d2a1418409000000, repr=TorchTrainer)
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 158, in __init__
(TrainTrainable pid=10580) self.setup(copy.deepcopy(self.config))
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/tune/trainable/util.py", line 120, in setup
(TrainTrainable pid=10580) super(_Inner, self).setup(config, **setup_kwargs)
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 845, in setup
(TrainTrainable pid=10580) ] = self._reconcile_scaling_config_with_trial_resources(
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 876, in _reconcile_scaling_config_with_trial_resources
(TrainTrainable pid=10580) trainer_cls._validate_scaling_config(scaling_config)
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/data_parallel_trainer.py", line 332, in _validate_scaling_config
(TrainTrainable pid=10580) scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config(
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/train/base_trainer.py", line 595, in _validate_scaling_config
(TrainTrainable pid=10580) ensure_only_allowed_dataclass_keys_updated(
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 38, in ensure_only_allowed_dataclass_keys_updated
(TrainTrainable pid=10580) bad_keys = [
(TrainTrainable pid=10580) File "/home/ecs-user/anaconda3/envs/ray/lib/python3.9/site-packages/ray/air/_internal/config.py", line 41, in <listcomp>
(TrainTrainable pid=10580) if dataclass.__dict__[key] != default_data.__dict__[key]
(TrainTrainable pid=10580) KeyError: 'topology'

Hi there! Can you let me know where in the docs you found this example so we can take a look?

It looks like the head node might have a version mismatch with the other nodes in the cluster. The KeyError is raised while Ray Train validates your ScalingConfig, and the two runs fail on different keys ('use_tpu' vs. 'topology') depending on which node the trainer actor landed on, which is consistent with different Ray builds being installed on different nodes.
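
For context, here is a rough, simplified sketch of why a version mismatch can surface as this exact KeyError. This is illustrative only, not Ray's actual code, and the class definitions and field placement below are hypothetical stand-ins: the ScalingConfig instance is pickled on your driver and then validated inside the trainer actor by comparing its fields against a default ScalingConfig built from whatever Ray version is installed on that node, so if the two versions define different fields the comparison looks up a key that only one side has.

import dataclasses

# Hypothetical stand-ins for the ScalingConfig of two different Ray builds.
@dataclasses.dataclass
class WorkerScalingConfig:                        # fields known to the node running the actor
    num_workers: int = 1
    use_gpu: bool = False

@dataclasses.dataclass
class DriverScalingConfig(WorkerScalingConfig):   # driver-side build defines one extra field
    use_tpu: bool = False

driver_config = DriverScalingConfig(num_workers=3)  # what the driver pickles and sends
node_default = WorkerScalingConfig()                # what the actor's Ray rebuilds as the default

try:
    # Simplified version of the check in the traceback: walk the driver-side
    # fields and look each one up on the node-side default instance.
    bad_keys = [
        key
        for key in driver_config.__dict__
        if driver_config.__dict__[key] != node_default.__dict__[key]
    ]
except KeyError as err:
    print(f"KeyError: {err}")  # -> KeyError: 'use_tpu'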

To confirm, do you mind SSHing into each node, running these commands, and letting me know what they say?

python -c "import ray; print(ray.__version__)"
python --version
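
If SSHing into every node is a hassle, here is a rough alternative sketch (my own suggestion, assuming the driver machine can already reach the cluster): schedule a tiny probe task pinned to each alive node and have it report that node's Ray and Python versions.

import ray
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

ray.init(address="auto")

@ray.remote(num_cpus=0)
def report_versions():
    # Runs inside a worker process on whichever node the task is pinned to.
    import sys
    import ray
    return ray.__version__, sys.version.split()[0]

for node in ray.nodes():
    if not node["Alive"]:
        continue
    # Pin one probe task to this specific node.
    ref = report_versions.options(
        scheduling_strategy=NodeAffinitySchedulingStrategy(
            node_id=node["NodeID"], soft=False
        )
    ).remote()
    print(node["NodeManagerAddress"], ray.get(ref))

If the versions really do differ, the probe on a mismatched node may itself fail to run, which would also be a useful signal.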