Changing object detection example to use local files

I’ve successfully followed Fine-tuning a Torch object detection model — Ray 2.9.3 to fine tune object detection. I have tried to update the code to use local images instead but get weird problems. I have tried to copy the annotations and images from the S3 bucket and saved them locally so they should be the same but to no avail.

The following code is different
S3 code from the guide:

class VOCAnnotationDatasource(FileBasedDatasource):
    def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator["pd.DataFrame"]:
        text = f.read().decode("utf-8")
        annotation = xmltodict.parse(text)["annotation"]

        objects = annotation["object"]
        # If there's one object, `objects` is a `dict`; otherwise, it's a `list[dict]`.
        if isinstance(objects, dict):
            objects = [objects]

        boxes: List[Tuple] = []
        for obj in objects:
            x1 = float(obj["bndbox"]["xmin"])
            y1 = float(obj["bndbox"]["ymin"])
            x2 = float(obj["bndbox"]["xmax"])
            y2 = float(obj["bndbox"]["ymax"])
            boxes.append((x1, y1, x2, y2))

        labels: List[int] = [CLASS_TO_LABEL[obj["name"]] for obj in objects]

        filename = annotation["filename"]
        yield pd.DataFrame(
            {
                "boxes": TensorArray([boxes]),
                "labels": TensorArray([labels]),
                "filename": [filename],
            }
        )

    def _rows_per_file(self):
        return 1

annotations: ray.data.Dataset = ray.data.read_datasource(
    VOCAnnotationDatasource("s3://anonymous@air-example-data/AnimalDetection/Annotations")
)
def read_images(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
    images: List[np.ndarray] = []
    for filename in batch["filename"]:
        url = os.path.join("https://s3-us-west-2.amazonaws.com/air-example-data/AnimalDetection/JPEGImages", filename)
        response = requests.get(url)
        image = Image.open(io.BytesIO(response.content))
        images.append(np.array(image))
    batch["image"] = np.array(images, dtype=object)
    return batch

My try to use local files (not working):

class VOCAnnotationDatasource(FileBasedDatasource):
    def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator["pd.DataFrame"]:
        text = f.read().decode("utf-8")
        annotation = xmltodict.parse(text)["annotation"]

        objects = annotation["object"]
        # If there's one object, `objects` is a `dict`; otherwise, it's a `list[dict]`.
        if isinstance(objects, dict):
            objects = [objects]

        boxes: List[Tuple] = []
        for obj in objects:
            x1 = float(obj["bndbox"]["xmin"])
            y1 = float(obj["bndbox"]["ymin"])
            x2 = float(obj["bndbox"]["xmax"])
            y2 = float(obj["bndbox"]["ymax"])
            boxes.append((x1, y1, x2, y2))

        labels: List[int] = [CLASS_TO_LABEL[obj["name"]] for obj in objects]

        filename = annotation["filename"]

        yield pd.DataFrame(
            {
                "boxes": np.array([boxes]),
                "labels": np.array([labels]),
                "filename": [filename],
            }
        )

    def _rows_per_file(self):
        return 1

annotations: ray.data.Dataset = ray.data.read_datasource(
    VOCAnnotationDatasource("/local/absolut/path/Annotations")
)

def read_images(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
    images: List[np.ndarray] = []
    for filename in batch["filename"]:
        url = os.path.join("/media/mikael/data/Documents/DroneTrainDataset/JPEGImages", filename)
        image = Image.open(url)
        images.append(np.array(image))
        #images.append(image) tried this and many other things but no cigar
    batch["image"] = np.array(images, dtype=object)
    return batch

If I change the local path version to

def read_images(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
    return batch

It continues (but obviously crashes later since the image data is not there.

The actual error is

024-02-28 11:48:48,858 ERROR streaming_executor_state.py:496 -- An exception was raised from a task of operator "ReadVOCAnnotation->SplitBlocks(11)". Dataset execution will now abort. To ignore this exception and continue, set DataContext.max_errored_blocks.
Traceback (most recent call last):                                                                                  
  File "/home/userA/Documents/repos/image_classifier/theobjectdetect.py", line 118, in <module>
    train_dataset, test_dataset = dataset.train_test_split(0.2)
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/dataset.py", line 1809, in train_test_split
    return ds.split_proportionately([1 - test_size])
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/dataset.py", line 1738, in split_proportionately
    dataset_length = self.count()
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/dataset.py", line 2606, in count
    [get_num_rows.remote(block) for block in self.get_internal_block_refs()]
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/dataset.py", line 4779, in get_internal_block_refs
    blocks = self._plan.execute().get_blocks()
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/plan.py", line 628, in execute
    blocks = execute_to_legacy_block_list(
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py", line 126, in execute_to_legacy_block_list
    block_list = _bundles_to_block_list(bundles)
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py", line 411, in _bundles_to_block_list
    for ref_bundle in bundles:
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/executor.py", line 37, in __next__
    return self.get_next()
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py", line 145, in get_next
    item = self._outer._output_node.get_output_blocking(
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor_state.py", line 320, in get_output_blocking
    raise self._exception
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py", line 212, in run
    while self._scheduling_loop_step(self._topology) and not self._shutdown:
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py", line 260, in _scheduling_loop_step
    num_errored_blocks = process_completed_tasks(
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor_state.py", line 497, in process_completed_tasks
    raise e from None
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor_state.py", line 464, in process_completed_tasks
    num_blocks_read = task.on_data_ready(
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py", line 102, in on_data_ready
    raise ex from None
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py", line 98, in on_data_ready
    ray.get(block_ref)
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/_private/worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::ReadVOCAnnotation->SplitBlocks(11)() (pid=289794, ip=10.12.12.5)
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 430, in __call__
    for block in blocks:
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 371, in __call__
    for data in iter:
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 232, in __call__
    yield from self._block_fn(input, ctx)
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/_internal/planner/plan_read_op.py", line 90, in do_read
    yield from call_with_retry(
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/datasource/datasource.py", line 237, in __call__
    yield from result
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/datasource/file_based_datasource.py", line 308, in read_task_fn
    yield from read_files(read_paths)
  File "/home/userA/anaconda3/lib/python3.10/site-packages/ray/data/datasource/file_based_datasource.py", line 279, in read_files
    for block in read_stream(f, read_path):
  File "/home/userA/Documents/repos/image_classifier/theobjectdetect.py", line 74, in _read_stream
    yield pd.DataFrame(
  File "/home/userA/.local/lib/python3.10/site-packages/pandas/core/frame.py", line 709, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
  File "/home/userA/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 481, in dict_to_mgr
    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
  File "/home/userA/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 115, in arrays_to_mgr
    index = _extract_index(arrays)
  File "/home/userA/.local/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 642, in _extract_index
    raise ValueError("Per-column arrays must each be 1-dimensional")
ValueError: Per-column arrays must each be 1-dimensional

Things I 've checked:

  • The shape and dimension between the two versions are the same
  • The image mode is the same (RBG)
  • A lot of different ways to load the images (that are correctly loaded - or at least the the image object seem correct with the right resolution etc)

That’s super weird. I’m not sure why it’d work with S3 but not locally.

What happens when you update the code to this? I suspect it should fix the issue.

        from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder

        builder = DelegatingBlockBuilder()
        builder.add_batch(
            {
                "boxes": np.array([boxes]),
                "labels": np.array([labels]),
                "filename": [filename],
            }
        )
        block = builder.build()

        yield block