Correct way of resuming trials

TL;DR: I am trying to set up a cluster with preemptible workers on GCP, but I am having trouble with resuming trials.

I am trying to understand the resume behavior by randomly shutting down one of the workers with ray kill-random-node -y gcp_our.yaml --hard. I have seen that, if I do not set resume (to LOCAL or anything else), the experiment restarts from the first iteration when a worker dies. When resume is set to LOCAL, I get: Called resume when no checkpoint exists in local directory.
My desired behavior is that, when a worker dies, the affected trials resume from their current iteration rather than from the beginning. How can I achieve that?
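To make sure I am asking about the right mechanism: as far as I understand the docs, per-trial recovery (restarting a trial from its latest checkpoint after a node failure) comes from checkpoint_freq together with max_failures, while resume only restarts the whole driver script from an existing experiment checkpoint. A minimal sketch of how I read it, in case my assumption about this split is wrong:

from ray import tune

# 1) Per-trial recovery: if a worker dies, Tune should restart the affected
#    trials from their latest checkpoint, provided the Trainable implements
#    save_checkpoint/load_checkpoint and checkpointing is enabled:
tune.run(
    TrainKMNIST,          # the Trainable defined in the script below
    checkpoint_freq=3,    # write a checkpoint every 3 iterations
    max_failures=10,      # retry a failed trial up to 10 times
)

# 2) Whole-experiment resume: only for re-running the driver script itself,
#    and it needs an existing experiment checkpoint in local_dir:
tune.run(TrainKMNIST, name="tune_exp", resume="LOCAL")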

Here is the code I am using:

from __future__ import print_function

import os

import ray
from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.ax import AxClient, AxSearch
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.utils import validate_save_restore
import torch
import torch.optim as optim

# Model/data helpers used by the Trainable below (taken here from the Ray Tune
# MNIST example; adjust to your own module if needed).
from ray.tune.examples.mnist_pytorch import ConvNet, get_data_loaders, train, test

from configs import METRIC, MINIMIZE

## TRAINABLE

class TrainKMNIST(tune.Trainable):

    def setup(self, config):
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.train_loader, self.test_loader = get_data_loaders()
        self.model = ConvNet().to(self.device)
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config.get("lr", 0.01),
            momentum=config.get("momentum", 0.9))

    def step(self):
        train(
            self.model, self.optimizer, self.train_loader, device=self.device)
        acc = test(self.model, self.test_loader, self.device)
        return {"mean_accuracy": acc}

    def save_checkpoint(self, checkpoint_dir):
        # Called by Tune every checkpoint_freq iterations (and at the end if
        # checkpoint_at_end=True); the returned path is what gets synced/restored.
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def load_checkpoint(self, checkpoint_path):
        # Called by Tune when a trial is restored after a failure or a pause.
        self.model.load_state_dict(torch.load(checkpoint_path))

if __name__ == "__main__":

    try:
        # Connect to the existing cluster started by the autoscaler.
        ray.init(address="auto")
    except Exception:
        # Fall back to a local Ray instance (e.g. when running outside the cluster).
        ray.init()


    # FOR EARLY STOPPING
    # see https://docs.ray.io/en/master/tune/api_docs/schedulers.html to choose a scheduler
    sched = AsyncHyperBandScheduler(
        metric=METRIC,  # metric used to compare trials
        mode="min" if MINIMIZE else "max",  # ..according to that metric
        reduction_factor=2,  # roughly 1/reduction_factor of trials survive each halving
        grace_period=8,  # each trial runs at least grace_period iterations before the scheduler can stop it
    )

    ## NEW TRIAL PROPOSALS
    # see https://docs.ray.io/en/master/tune/api_docs/suggestion.html to choose a search algorithm

    # see https://ax.dev/tutorials/tune_cnn.html for parameters
    parameters = [
        {
            "name": "lr",
            "type": "range",
            "bounds": [1e-6, 0.4],
            "value_type": "float",
            "log_scale": True,
        },
        {
            "name": "momentum",
            "type": "range",
            "value_type": "float",
            "bounds": [0.0, 1.0],
        },
    ]

    # see https://ax.dev/versions/latest/tutorials/gpei_hartmann_service.html for experiment setup
    client = AxClient(enforce_sequential_optimization=False, verbose_logging=False)
    client.create_experiment(
        name="nome", parameters=parameters, objective_name=METRIC, minimize=MINIMIZE,
    )
    searc = AxSearch(ax_client=client)
    # , metric=METRIC, mode="min" if MINIMIZE else "max")
    searc = ConcurrencyLimiter(searc, max_concurrent=12)

    # both of these should return without raising if checkpointing is implemented correctly
    validate_save_restore(TrainKMNIST)
    validate_save_restore(TrainKMNIST, use_object_store=True)

    analysis = tune.run(
        TrainKMNIST,
        scheduler=sched,
        search_alg=searc,
        stop={"mean_accuracy": 1.1, "training_iteration": 500},
        resources_per_trial={
            "cpu": 2,
            "gpu": int(ray.cluster_resources()["GPU"] / ray.cluster_resources()["GPU"]) / 2,
        },  # EDIT
        num_samples=20,
        # metric="mean_accuracy",
        # mode="max",
        #name="ray_results",
        #local_dir=HOME,
        max_failures=10,
        checkpoint_freq=3,
        checkpoint_at_end=True,
        resume="LOCAL"
        # config=parameters,
    )

    print(
        "Best config is:",
        analysis.get_best_config(metric=METRIC, mode="min" if MINIMIZE else "max"),
    )

while the .yaml is:

# An unique identifier for the head node and workers of this cluster.
cluster_name: test

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 2

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker: {}

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: gcp
    region: europe-west1
    availability_zone: europe-west1-b
    project_id: humanitas-rad-ai-20-00

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
#    ssh_private_key: /path/to/your/key.pem

# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
    machineType: n1-standard-4
    disks:
      - boot: true
        autoDelete: true
        type: PERSISTENT
        initializeParams:
          diskSizeGb: 50
          # See https://cloud.google.com/compute/docs/images for more images
          sourceImage: projects/deeplearning-platform-release/global/images/family/pytorch-latest-cu101-debian-10

    # Additional options can be found in the compute docs at
    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

    # If the network interface is specified as below in both head and worker
    # nodes, the manual network config is used.  Otherwise an existing subnet is
    # used.  To use a shared subnet, ask the subnet owner to grant permission
    # for 'compute.subnetworks.use' to the ray autoscaler account...
    # networkInterfaces:
    #   - kind: compute#networkInterface
    #     subnetwork: path/to/subnet
    #     aliasIpRanges: []

worker_nodes:
    machineType: n1-standard-4
    disks:
      - boot: true
        autoDelete: true
        type: PERSISTENT
        initializeParams:
          diskSizeGb: 50
          # See https://cloud.google.com/compute/docs/images for more images
          sourceImage: projects/deeplearning-platform-release/global/images/family/pytorch-latest-cu101-debian-10
    guestAccelerators:
      - acceleratorType: projects/humanitas-rad-ai-20-00/zones/europe-west1-b/acceleratorTypes/nvidia-tesla-k80
        acceleratorCount: 1
    metadata:
      items:
        - key: install-nvidia-driver
          value: "True"
    scheduling:
      - onHostMaintenance: TERMINATE
      - preemptible: true

    # Additional options can be found in the compute docs at
    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
  "./main.py": "./main.py",
  "./configs.py": "./configs.py",
#    "/path1/on/remote/machine": "/path1/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude: []

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter: []

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands:
    # Note: if you're developing Ray, you probably want to create an AMI that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc

    # Install MiniConda.
    - >-
      wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/anaconda3.sh
      || true
      && bash ~/anaconda3.sh -b -p ~/anaconda3 || true
      && rm ~/anaconda3.sh
      && echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile

    # Install ray
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
    - pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
    - pip install pandas ray[tune] ax-platform sqlalchemy

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
  - pip install google-api-python-client==1.7.8

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - >-
      ulimit -n 65536;
      ray start
      --head
      --port=6379
      --object-manager-port=8076
      --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - >-
      ulimit -n 65536;
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076
      

Thank you in advance,

Riccardo


Can you post the output of the running experiment?

Yes, sure. I removed the GPUs from the workers for simplicity.
Here is an example of my real issue:

With tune.run as:

analysis = tune.run(
        TrainKMNIST,
        scheduler=sched,
        search_alg=searc,
        stop={"mean_accuracy": 0.9, "training_iteration": 500},
        resources_per_trial={
            "cpu": 2,
            "gpu": 0
        },  # EDIT
        num_samples=20,
        name="tune_exp",
        max_failures=-1,
        checkpoint_freq=3,
    )

After a while the situation was:

Memory usage on this node: 1.5/14.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 64.000: None | Iter 32.000: 0.6 | Iter 16.000: 0.51875 | Iter 8.000: 0.28125
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/28.68 GiB heap, 0.0/12.9 GiB objects
Result logdir: /home/ubuntu/ray_results/tune_exp
Number of trials: 17/20 (6 PENDING, 6 RUNNING, 5 TERMINATED)
+----------------------+------------+--------------------+-------------+-------------+----------+--------+------------------+
| Trial name           | status     | loc                |          lr |    momentum |      acc |   iter |   total time (s) |
|----------------------+------------+--------------------+-------------+-------------+----------+--------+------------------|
| TrainKMNIST_38678fd4 | RUNNING    | 10.132.15.230:8615 | 0.0137873   | 0.136199    | 0.609375 |     27 |         8.82524  |
| TrainKMNIST_386b2716 | RUNNING    | 10.132.0.4:8637    | 0.0290271   | 0.107078    | 0.584375 |     17 |         5.59015  |
| TrainKMNIST_386f4166 | RUNNING    | 10.132.0.3:9523    | 0.00513989  | 0.254272    | 0.596875 |     62 |        19.2448   |
| TrainKMNIST_38745340 | RUNNING    | 10.132.0.3:9525    | 0.0155962   | 0.0215544   | 0.61875  |     59 |        18.8971   |
| TrainKMNIST_389cc24e | RUNNING    | 10.132.15.230:9079 | 0.000340849 | 0.753652    | 0.0875   |      1 |         0.316088 |
| TrainKMNIST_38a6a05c | RUNNING    |                    | 0.00679255  | 0.478938    |          |        |                  |
| TrainKMNIST_38b1b6a4 | PENDING    |                    | 0.000877999 | 0.244587    |          |        |                  |
| TrainKMNIST_41cabace | PENDING    |                    | 0.000178092 | 0.241587    |          |        |                  |
| TrainKMNIST_44bfa488 | PENDING    |                    | 0.00015556  | 0.13273     |          |        |                  |
| TrainKMNIST_461e2cc8 | PENDING    |                    | 0.307777    | 0.73092     |          |        |                  |
| TrainKMNIST_48b015be | PENDING    |                    | 0.108249    | 0.413707    |          |        |                  |
| TrainKMNIST_4a8ef710 | PENDING    |                    | 1e-06       | 1.63526e-16 |          |        |                  |
| TrainKMNIST_38653220 | TERMINATED |                    | 1.22015e-06 | 0.929689    | 0.134375 |      8 |         2.47136  |
| TrainKMNIST_387a0f74 | TERMINATED |                    | 0.00282857  | 0.324133    | 0.28125  |      8 |         2.58102  |
| TrainKMNIST_3880b32e | TERMINATED |                    | 0.00438698  | 0.601976    | 0.290625 |      8 |         2.48458  |
| TrainKMNIST_38891a1e | TERMINATED |                    | 0.000163046 | 0.813097    | 0.08125  |      8 |         2.53252  |
| TrainKMNIST_38934516 | TERMINATED |                    | 1.159e-05   | 0.653644    | 0.15     |      8 |         2.57075  |
+----------------------+------------+--------------------+-------------+-------------+----------+--------+------------------+

Then, I ran the ray kill-random-node -y gcp_our.yaml --hard command and got:

2021-03-12 09:14:07,036	ERROR trial_runner.py:894 -- Trial TrainKMNIST_38678fd4: Error handling checkpoint /home/ubuntu/ray_results/tune_exp/TrainKMNIST_38678fd4_2_lr=0.013787,momentum=0.1362_2021-03-12_09-13-35/checkpoint_000030/model.pth
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 886, in _process_trial_save
    self._callbacks.on_checkpoint(
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/callback.py", line 216, in on_checkpoint
    callback.on_checkpoint(**info)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 455, in on_checkpoint
    self._sync_trial_checkpoint(trial, checkpoint)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 425, in _sync_trial_checkpoint
    raise e
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 411, in _sync_trial_checkpoint
    trial_syncer.wait()
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 202, in wait
    self.sync_client.wait()
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/sync_client.py", line 235, in wait
    raise TuneError("Sync error. Ran command: {}\n"
ray.tune.error.TuneError: Sync error. Ran command: rsync  -savz -e 'ssh -i /home/ubuntu/ray_bootstrap_key.pem -o ConnectTimeout=120s -o StrictHostKeyChecking=no' ubuntu@10.132.15.230:/home/ubuntu/ray_results/tune_exp/TrainKMNIST_38678fd4_2_lr=0.013787,momentum=0.1362_2021-03-12_09-13-35/ /home/ubuntu/ray_results/tune_exp/TrainKMNIST_38678fd4_2_lr=0.013787,momentum=0.1362_2021-03-12_09-13-35/
Error message (255): packet_write_wait: Connection to 10.132.15.230 port 22: Broken pipe
rsync: connection unexpectedly closed (0 bytes received so far) [Receiver]
rsync error: unexplained error (code 255) at io.c(235) [Receiver=3.1.3]

2021-03-12 09:14:07,065	ERROR trial_runner.py:727 -- Trial TrainKMNIST_38678fd4: Error processing event.
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 697, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 678, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/worker.py", line 1442, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task. Check python-core-worker-*.log files for more information.
2021-03-12 09:14:07,067	INFO trial_runner.py:1002 -- Trial TrainKMNIST_38678fd4: Attempting to restore trial state from last checkpoint.
2021-03-12 09:14:07,078	ERROR trial_runner.py:727 -- Trial TrainKMNIST_389cc24e: Error processing event.
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 697, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 678, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/worker.py", line 1442, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task. Check python-core-worker-*.log files for more information.
2021-03-12 09:14:07,079	INFO trial_runner.py:1002 -- Trial TrainKMNIST_389cc24e: Attempting to restore trial state from last checkpoint.

And I can see that 2 trials have been interrupted (which is OK):

== Status ==
Memory usage on this node: 1.5/14.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 64.000: 0.5984375 | Iter 32.000: 0.6 | Iter 16.000: 0.51875 | Iter 8.000: 0.2859375
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/28.68 GiB heap, 0.0/12.9 GiB objects
Result logdir: /home/ubuntu/ray_results/tune_exp
Number of trials: 17/20 (6 PENDING, 6 RUNNING, 5 TERMINATED)
+----------------------+------------+-----------------+-------------+-------------+----------+--------+------------------+
| Trial name           | status     | loc             |          lr |    momentum |      acc |   iter |   total time (s) |
|----------------------+------------+-----------------+-------------+-------------+----------+--------+------------------|
| TrainKMNIST_38678fd4 | RUNNING    |                 | 0.0137873   | 0.136199    | 0.615625 |     30 |          9.81758 |
| TrainKMNIST_386b2716 | RUNNING    | 10.132.0.4:8637 | 0.0290271   | 0.107078    | 0.65     |     25 |          8.1779  |
| TrainKMNIST_386f4166 | RUNNING    | 10.132.0.3:9523 | 0.00513989  | 0.254272    | 0.640625 |     73 |         22.7214  |
| TrainKMNIST_38745340 | RUNNING    | 10.132.0.3:9525 | 0.0155962   | 0.0215544   | 0.65     |     67 |         21.5347  |
| TrainKMNIST_389cc24e | RUNNING    |                 | 0.000340849 | 0.753652    | 0.05625  |      4 |          1.2438  |
| TrainKMNIST_38a6a05c | RUNNING    | 10.132.0.4:8642 | 0.00679255  | 0.478938    | 0.415625 |      8 |          2.6559  |
| TrainKMNIST_38b1b6a4 | PENDING    |                 | 0.000877999 | 0.244587    |          |        |                  |
| TrainKMNIST_41cabace | PENDING    |                 | 0.000178092 | 0.241587    |          |        |                  |
| TrainKMNIST_44bfa488 | PENDING    |                 | 0.00015556  | 0.13273     |          |        |                  |
| TrainKMNIST_461e2cc8 | PENDING    |                 | 0.307777    | 0.73092     |          |        |                  |
| TrainKMNIST_48b015be | PENDING    |                 | 0.108249    | 0.413707    |          |        |                  |
| TrainKMNIST_4a8ef710 | PENDING    |                 | 1e-06       | 1.63526e-16 |          |        |                  |
| TrainKMNIST_38653220 | TERMINATED |                 | 1.22015e-06 | 0.929689    | 0.134375 |      8 |          2.47136 |
| TrainKMNIST_387a0f74 | TERMINATED |                 | 0.00282857  | 0.324133    | 0.28125  |      8 |          2.58102 |
| TrainKMNIST_3880b32e | TERMINATED |                 | 0.00438698  | 0.601976    | 0.290625 |      8 |          2.48458 |
| TrainKMNIST_38891a1e | TERMINATED |                 | 0.000163046 | 0.813097    | 0.08125  |      8 |          2.53252 |
| TrainKMNIST_38934516 | TERMINATED |                 | 1.159e-05   | 0.653644    | 0.15     |      8 |          2.57075 |
+----------------------+------------+-----------------+-------------+-------------+----------+--------+------------------+
Number of errored trials: 2
+----------------------+--------------+------------------------------------------------------------------------------------------------------------------------+
| Trial name           |   # failures | error file                                                                                                             |
|----------------------+--------------+------------------------------------------------------------------------------------------------------------------------|
| TrainKMNIST_38678fd4 |            1 | /home/ubuntu/ray_results/tune_exp/TrainKMNIST_38678fd4_2_lr=0.013787,momentum=0.1362_2021-03-12_09-13-35/error.txt     |
| TrainKMNIST_389cc24e |            1 | /home/ubuntu/ray_results/tune_exp/TrainKMNIST_389cc24e_10_lr=0.00034085,momentum=0.75365_2021-03-12_09-13-57/error.txt |
+----------------------+--------------+------------------------------------------------------------------------------------------------------------------------+

Now, what I expected was that, once resources became available again, trial TrainKMNIST_38678fd4 would continue from iteration 30, but instead I got:

== Status ==
Memory usage on this node: 1.5/14.7 GiB
Using AsyncHyperBand: num_stopped=15
Bracket: Iter 64.000: 0.640625 | Iter 32.000: 0.6375 | Iter 16.000: 0.540625 | Iter 8.000: 0.215625
Resources requested: 10.0/8 CPUs, 0/0 GPUs, 0.0/18.64 GiB heap, 0.0/8.6 GiB objects
Result logdir: /home/ubuntu/ray_results/tune_exp
Number of trials: 20/20 (5 RUNNING, 15 TERMINATED)
+----------------------+------------+------------------+-------------+-------------+----------+--------+------------------+
| Trial name           | status     | loc              |          lr |    momentum |      acc |   iter |   total time (s) |
|----------------------+------------+------------------+-------------+-------------+----------+--------+------------------|
| TrainKMNIST_38678fd4 | RUNNING    | 10.132.0.3:10108 | 0.0137873   | 0.136199    | 0.16875  |      2 |         0.608118 |
| TrainKMNIST_386b2716 | RUNNING    | 10.132.0.4:8637  | 0.0290271   | 0.107078    | 0.75     |     93 |        30.8446   |
| TrainKMNIST_389cc24e | RUNNING    |                  | 0.000340849 | 0.753652    | 0.05625  |      4 |         1.2438   |
| TrainKMNIST_48b015be | RUNNING    | 10.132.0.4:9424  | 0.108249    | 0.413707    | 0.734375 |     32 |         9.62748  |
| TrainKMNIST_59939a36 | RUNNING    | 10.132.0.3:10111 | 0.0179864   | 0.35667     | 0.534375 |     12 |         3.80412  |
| TrainKMNIST_38653220 | TERMINATED |                  | 1.22015e-06 | 0.929689    | 0.134375 |      8 |         2.47136  |
| TrainKMNIST_386f4166 | TERMINATED |                  | 0.00513989  | 0.254272    | 0.628125 |    100 |        31.0911   |
| TrainKMNIST_38745340 | TERMINATED |                  | 0.0155962   | 0.0215544   | 0.65625  |    100 |        32.2521   |
| TrainKMNIST_387a0f74 | TERMINATED |                  | 0.00282857  | 0.324133    | 0.28125  |      8 |         2.58102  |
| TrainKMNIST_3880b32e | TERMINATED |                  | 0.00438698  | 0.601976    | 0.290625 |      8 |         2.48458  |
| TrainKMNIST_38891a1e | TERMINATED |                  | 0.000163046 | 0.813097    | 0.08125  |      8 |         2.53252  |
| TrainKMNIST_38934516 | TERMINATED |                  | 1.159e-05   | 0.653644    | 0.15     |      8 |         2.57075  |
| TrainKMNIST_38a6a05c | TERMINATED |                  | 0.00679255  | 0.478938    | 0.55     |     32 |        10.4806   |
| TrainKMNIST_38b1b6a4 | TERMINATED |                  | 0.000877999 | 0.244587    | 0.090625 |      8 |         2.5811   |
| TrainKMNIST_41cabace | TERMINATED |                  | 0.000178092 | 0.241587    | 0.128125 |      8 |         2.60457  |
| TrainKMNIST_44bfa488 | TERMINATED |                  | 0.00015556  | 0.13273     | 0.10625  |      8 |         2.58267  |
| TrainKMNIST_461e2cc8 | TERMINATED |                  | 0.307777    | 0.73092     | 0.528125 |     16 |         5.3081   |
| TrainKMNIST_4a8ef710 | TERMINATED |                  | 1e-06       | 1.63526e-16 | 0.090625 |      8 |         2.59913  |
| TrainKMNIST_559ee354 | TERMINATED |                  | 0.4         | 2.33851e-17 | 0.096875 |      8 |         2.83368  |
| TrainKMNIST_56762aa8 | TERMINATED |                  | 0.4         | 0.261693    | 0.084375 |      8 |         2.52533  |
+----------------------+------------+------------------+-------------+-------------+----------+--------+------------------+
Number of errored trials: 2
+----------------------+--------------+------------------------------------------------------------------------------------------------------------------------+
| Trial name           |   # failures | error file                                                                                                             |
|----------------------+--------------+------------------------------------------------------------------------------------------------------------------------|
| TrainKMNIST_38678fd4 |            2 | /home/ubuntu/ray_results/tune_exp/TrainKMNIST_38678fd4_2_lr=0.013787,momentum=0.1362_2021-03-12_09-13-35/error.txt     |
| TrainKMNIST_389cc24e |            2 | /home/ubuntu/ray_results/tune_exp/TrainKMNIST_389cc24e_10_lr=0.00034085,momentum=0.75365_2021-03-12_09-13-57/error.txt |
+----------------------+--------------+------------------------------------------------------------------------------------------------------------------------+

It seems that it did not use the checkpoints.
I have also noticed that this behavior is intermittent: with the same script, sometimes trials recover from their checkpoints and sometimes they restart from the beginning.

This is my real issue.
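One thing I am considering, in case the failed rsync between nodes is what loses the checkpoint, is uploading checkpoints to a GCS bucket so that they survive a preempted worker. A rough sketch of what I mean, assuming tune.SyncConfig with an upload_dir is available in this Ray version (the bucket path is just a placeholder):

from ray import tune

analysis = tune.run(
    TrainKMNIST,
    name="tune_exp",
    checkpoint_freq=3,
    max_failures=-1,
    # Checkpoints would be synced to cloud storage instead of relying on the
    # node-to-node rsync that failed above when the worker was preempted.
    sync_config=tune.SyncConfig(upload_dir="gs://<my-bucket>/tune_exp"),
)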

Moreover, if I set checkpoint_at_end=True, I get:

18169856it [00:20, 3355911.04it/s]                              
2021-03-12 10:56:28,786 ERROR syncer.py:190 -- Sync execution failed.
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 186, in sync_down
    result = self.sync_client.sync_down(self._remote_path,
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/sync_client.py", line 212, in sync_down
    return self._execute(self.sync_down_template, source, target)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/sync_client.py", line 271, in _execute
    stdout=self._get_logfile())
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/sync_client.py", line 201, in _get_logfile
    raise RuntimeError(
RuntimeError: [internalerror] The client has been closed. Please report this stacktrace + your cluster configuration on Github!
2021-03-12 10:56:28,787 ERROR syncer.py:413 -- Trial TrainKMNIST_8bbe4516: Checkpoint sync skipped. This should not happen.
2021-03-12 10:56:28,788 ERROR trial_runner.py:894 -- Trial TrainKMNIST_8bbe4516: Error handling checkpoint /home/ubuntu/ray_results/tune_exp/TrainKMNIST_8bbe4516_5_lr=0.094173,momentum=0.98229_2021-03-12_10-56-07/checkpoint_000008/model.pth
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 886, in _process_trial_save
    self._callbacks.on_checkpoint(
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/callback.py", line 216, in on_checkpoint
    callback.on_checkpoint(**info)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 455, in on_checkpoint
    self._sync_trial_checkpoint(trial, checkpoint)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/syncer.py", line 428, in _sync_trial_checkpoint
    raise TuneError("Trial {}: Checkpoint path {} not "
ray.tune.error.TuneError: Trial TrainKMNIST_8bbe4516: Checkpoint path /home/ubuntu/ray_results/tune_exp/TrainKMNIST_8bbe4516_5_lr=0.094173,momentum=0.98229_2021-03-12_10-56-07/checkpoint_000008/model.pth not found after successful sync down.

While, if I set resume="LOCAL" (or any other resume mode), I get:

2021-03-12 11:03:18,823 WARNING trial_runner.py:422 -- Attempting to resume experiment from /home/ubuntu/ray_results/tune_exp. This will ignore any new changes to the specification.
2021-03-12 11:03:18,824 ERROR trial_runner.py:279 -- Searcher unable to find checkpoint in /home/ubuntu/ray_results/tune_exp
2021-03-12 11:03:18,824 ERROR trial_runner.py:280 -- Runner restore failed.
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 275, in __init__
    self.resume(run_errored_only=errored_only)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 430, in resume
    self._search_alg.restore_from_dir(self._local_checkpoint_dir)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/suggest/search_generator.py", line 213, in restore_from_dir
    base_searcher.restore_from_dir(dirpath)
  File "/home/ubuntu/anaconda3/lib/python3.8/site-packages/ray/tune/suggest/suggestion.py", line 297, in restore_from_dir
    raise RuntimeError(
RuntimeError: Searcher unable to find checkpoint in /home/ubuntu/ray_results/tune_exp
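For completeness, this is roughly how I have been checking whether there is anything for resume to pick up in the experiment directory (a sketch; the experiment_state-*.json file name and the idea that the searcher checkpoint must live in the same directory are my reading of the traceback above, not something I have confirmed in the docs):

import glob
import os

exp_dir = os.path.expanduser("~/ray_results/tune_exp")

# Experiment-level checkpoint that resume="LOCAL" looks for.
print(glob.glob(os.path.join(exp_dir, "experiment_state-*.json")))

# The searcher (AxSearch) state is restored separately via restore_from_dir,
# so I also list the directory to see whether any searcher checkpoint exists.
print(os.listdir(exp_dir))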

@rliaw, can I resume an experiment when my Trainable does not implement save_checkpoint and load_checkpoint, on ray==1.11?

2022-05-13 18:31:52,041	WARNING trial_runner.py:280 -- The maximum number of pending trials has been automatically set to the number of available cluster CPUs, which is high (281 CPUs/pending trials). If you're running an experiment with a large number of trials, this could lead to scheduling overhead. In this case, consider setting the `TUNE_MAX_PENDING_TRIALS_PG` environment variable to the desired maximum number of concurrent trials.
2022-05-13 18:31:52,043	INFO trial_runner.py:520 -- Downloading experiment checkpoint from hdfs:///<SOME_HDFS_PATH>/MyTrainable_2022-05-10_01-13-45
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_254/3842597904.py in <module>
      6     name="MyTrainable_2022-05-10_01-13-45",
      7     resume="REMOTE",
----> 8     stop={"training_iteration": 1}
      9 )

/tmp/ipykernel_254/1153959889.py in run_tuner(train_config, search_id, **tune_run_kwargs)
     28             sync_period=hdfs_sync_interval_in_min * 60 # only syncs from head node and not from all workers at this interval
     29         ),
---> 30         **tune_run_kwargs
     31     )
     32     return analysis

~/.local/lib/python3.7/site-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, queue_trials, loggers, _remote)
    546         # Driver should only sync trial checkpoints if
    547         # checkpoints are not synced to cloud
--> 548         driver_sync_trial_checkpoints=not bool(sync_config.upload_dir))
    549 
    550     if not runner.resumed:

~/.local/lib/python3.7/site-packages/ray/tune/trial_runner.py in __init__(self, search_alg, scheduler, local_checkpoint_dir, remote_checkpoint_dir, sync_config, stopper, resume, server_port, fail_fast, checkpoint_period, trial_executor, callbacks, metric, driver_sync_trial_checkpoints)
    348         if self._validate_resume(
    349                 resume_type=resume,
--> 350                 driver_sync_trial_checkpoints=driver_sync_trial_checkpoints):
    351             errored_only = False
    352             if isinstance(resume, str):

~/.local/lib/python3.7/site-packages/ray/tune/trial_runner.py in _validate_resume(self, resume_type, driver_sync_trial_checkpoints)
    539 
    540             if not self.checkpoint_exists(self._local_checkpoint_dir):
--> 541                 raise ValueError("Called resume when no checkpoint exists "
    542                                  "in remote or local directory.")
    543         return True

ValueError: Called resume when no checkpoint exists in remote or local directory.

Unfortunately that won’t work…
