Hello, I am trying to read JPG images from S3, but I am getting the error below. I am using:
Ray 2.6.0
Pillow 9.5.0
pyarrow: 12.0.1
numpy: 1.21.6
My code:
import ray
from IPython.display import display

# Shut down any Ray instance that is already running in this process so
# that ray.init() below starts from a clean state.
if ray.is_initialized():
    ray.shutdown()
ray.init()
print(ray.cluster_resources())

# S3 prefix that holds the JPG files to load.
s3_uri = "s3://data-science/data-preparation/test"
# include_paths=True asks Ray Data to keep each file's path with its image
# (see the ray.data.read_images documentation for the resulting columns).
ds = ray.data.read_images(s3_uri, include_paths=True)
display(ds)
# NOTE: the return value is only echoed automatically in a notebook cell;
# wrap the call in print() to see the schema when running as a script.
ds.schema()
This is how images are stored on S3.
/usr/local/bin/python3.7 /Users/anup.rawka/IdeaProjects/data-science/raysdk/streaming_data_train.py
PIL: 9.5.0
2023-08-01 11:57:10,517 INFO worker.py:1621 -- Started a local Ray instance.
{'memory': 5961287680.0, 'CPU': 12.0, 'node:127.0.0.1': 1.0, 'object_store_memory': 2147483648.0, 'node:__internal_head__': 1.0}
2023-08-01 11:57:16,159 INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadImage]
2023-08-01 11:57:16,159 INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-08-01 11:57:16,159 INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
(ReadImage pid=72712) Task failed with retryable exception: TaskID(4bd1bc551048d2a5ffffffffffffffffffffffff01000000).
(ReadImage pid=72712) Traceback (most recent call last):
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 1191, in ray._raylet.execute_dynamic_generator_and_store_task_outputs
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 3667, in ray._raylet.CoreWorker.store_task_outputs
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task
(ReadImage pid=72712) for b_out in fn(iter(blocks), ctx):
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/planner/plan_read_op.py", line 67, in do_read
(ReadImage pid=72712) yield from read_task()
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/datasource.py", line 214, in __call__
(ReadImage pid=72712) for block in result:
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 478, in read_files
(ReadImage pid=72712) for data in read_stream(f, read_path, **reader_args):
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 242, in _read_stream
(ReadImage pid=72712) yield self._read_file(f, path, **reader_args)
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/image_datasource.py", line 85, in _read_file
(ReadImage pid=72712) image = Image.open(io.BytesIO(data))
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/PIL/Image.py", line 3298, in open
(ReadImage pid=72712) raise UnidentifiedImageError(msg)
(ReadImage pid=72712) PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x19c7437d0>
(ReadImage pid=72712) Task failed with retryable exception: TaskID(4bd1bc551048d2a5ffffffffffffffffffffffff01000000).
(ReadImage pid=72712) Traceback (most recent call last):
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 1191, in ray._raylet.execute_dynamic_generator_and_store_task_outputs
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 3667, in ray._raylet.CoreWorker.store_task_outputs
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task
(ReadImage pid=72712) for b_out in fn(iter(blocks), ctx):
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/planner/plan_read_op.py", line 67, in do_read
(ReadImage pid=72712) yield from read_task()
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/datasource.py", line 214, in __call__
(ReadImage pid=72712) for block in result:
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 478, in read_files
(ReadImage pid=72712) for data in read_stream(f, read_path, **reader_args):
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 242, in _read_stream
(ReadImage pid=72712) yield self._read_file(f, path, **reader_args)
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/image_datasource.py", line 85, in _read_file
(ReadImage pid=72712) image = Image.open(io.BytesIO(data))
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/PIL/Image.py", line 3298, in open
(ReadImage pid=72712) raise UnidentifiedImageError(msg)
(ReadImage pid=72712) PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x19ca14170>
(ReadImage pid=72712) Task failed with retryable exception: TaskID(4bd1bc551048d2a5ffffffffffffffffffffffff01000000).
(ReadImage pid=72712) Traceback (most recent call last):
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 1191, in ray._raylet.execute_dynamic_generator_and_store_task_outputs
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 3667, in ray._raylet.CoreWorker.store_task_outputs
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task
(ReadImage pid=72712) for b_out in fn(iter(blocks), ctx):
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/planner/plan_read_op.py", line 67, in do_read
(ReadImage pid=72712) yield from read_task()
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/datasource.py", line 214, in __call__
(ReadImage pid=72712) for block in result:
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 478, in read_files
(ReadImage pid=72712) for data in read_stream(f, read_path, **reader_args):
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 242, in _read_stream
(ReadImage pid=72712) yield self._read_file(f, path, **reader_args)
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/image_datasource.py", line 85, in _read_file
(ReadImage pid=72712) image = Image.open(io.BytesIO(data))
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/PIL/Image.py", line 3298, in open
(ReadImage pid=72712) raise UnidentifiedImageError(msg)
(ReadImage pid=72712) PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x19cbe7dd0>
(ReadImage pid=72712) Task failed with retryable exception: TaskID(4bd1bc551048d2a5ffffffffffffffffffffffff01000000). [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)
(ReadImage pid=72712) Traceback (most recent call last): [repeated 2x across cluster]
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 1191, in ray._raylet.execute_dynamic_generator_and_store_task_outputs [repeated 2x across cluster]
(ReadImage pid=72712) File "python/ray/_raylet.pyx", line 3667, in ray._raylet.CoreWorker.store_task_outputs [repeated 2x across cluster]
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task [repeated 2x across cluster]
(ReadImage pid=72712) for b_out in fn(iter(blocks), ctx): [repeated 2x across cluster]
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/planner/plan_read_op.py", line 67, in do_read [repeated 2x across cluster]
(ReadImage pid=72712) yield from read_task() [repeated 2x across cluster]
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/datasource.py", line 214, in __call__ [repeated 2x across cluster]
(ReadImage pid=72712) for block in result: [repeated 2x across cluster]
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 478, in read_files [repeated 2x across cluster]
(ReadImage pid=72712) for data in read_stream(f, read_path, **reader_args): [repeated 2x across cluster]
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 242, in _read_stream [repeated 2x across cluster]
(ReadImage pid=72712) yield self._read_file(f, path, **reader_args) [repeated 2x across cluster]
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/image_datasource.py", line 85, in _read_file [repeated 2x across cluster]
(ReadImage pid=72712) image = Image.open(io.BytesIO(data)) [repeated 2x across cluster]
(ReadImage pid=72712) File "/usr/local/lib/python3.7/site-packages/PIL/Image.py", line 3298, in open [repeated 2x across cluster]
(ReadImage pid=72712) raise UnidentifiedImageError(msg) [repeated 2x across cluster]
(ReadImage pid=72712) PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x19cbe7e90> [repeated 2x across cluster]
(ReadImage pid=72705) builder.add(item)
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/delegating_block_builder.py", line 24, in add
(ReadImage pid=72705) check.build()
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/table_block.py", line 118, in build
(ReadImage pid=72705) tables = [self._table_from_pydict(columns)]
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/arrow_block.py", line 123, in _table_from_pydict
(ReadImage pid=72705) columns[col_name] = ArrowTensorArray.from_numpy(col)
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/air/util/tensor_extensions/arrow.py", line 312, in from_numpy
(ReadImage pid=72705) return ArrowVariableShapedTensorArray.from_numpy(arr)
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/air/util/tensor_extensions/arrow.py", line 717, in from_numpy
(ReadImage pid=72705) pa_dtype = pa.from_numpy_dtype(dtype)
(ReadImage pid=72705) File "pyarrow/types.pxi", line 4911, in pyarrow.lib.from_numpy_dtype
(ReadImage pid=72705) File "pyarrow/error.pxi", line 121, in pyarrow.lib.check_status
(ReadImage pid=72705) pyarrow.lib.ArrowNotImplementedError: Unsupported numpy type 17
Traceback (most recent call last):
File "/Users/anup.rawka/IdeaProjects/data-science/raysdk/streaming_data_train.py", line 70, in <module>
for batch in ds.iter_batches(batch_size=100, batch_format="numpy", prefetch_batches=10):
File "/usr/local/lib/python3.7/site-packages/ray/data/iterator.py", line 200, in iter_batches
prefetch_batches=prefetch_batches,
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/iter_batches.py", line 176, in iter_batches
next_batch = next(async_batch_iter)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 289, in make_async_gen
raise next_item
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 266, in execute_computation
for item in fn(thread_safe_generator):
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/iter_batches.py", line 167, in _async_iter_batches
yield from extract_data_from_batch(batch_iter)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 210, in extract_data_from_batch
for batch in batch_iter:
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/iter_batches.py", line 306, in restore_original_order
for batch in batch_iter:
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 289, in make_async_gen
raise next_item
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 266, in execute_computation
for item in fn(thread_safe_generator):
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/iter_batches.py", line 218, in threadpool_computations_format_collate
yield from formatted_batch_iter
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 158, in format_batches
for batch in block_iter:
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 246, in __next__
return next(self.it)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 117, in blocks_to_batches
for block in block_iter:
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 54, in resolve_block_refs
for block_ref in block_ref_iter:
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/iter_batches.py", line 271, in prefetch_batches_locally
next_block_ref_and_metadata = next(block_ref_iter)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/block_batching/util.py", line 246, in __next__
return next(self.it)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/legacy_compat.py", line 51, in execute_to_legacy_block_iterator
for bundle in bundle_iter:
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/interfaces.py", line 548, in __next__
return self.get_next()
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/streaming_executor.py", line 129, in get_next
raise item
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/streaming_executor.py", line 187, in run
while self._scheduling_loop_step(self._topology) and not self._shutdown:
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/streaming_executor.py", line 235, in _scheduling_loop_step
process_completed_tasks(topology)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/streaming_executor_state.py", line 333, in process_completed_tasks
op.notify_work_completed(ref)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/task_pool_map_operator.py", line 65, in notify_work_completed
task.output = self._map_ref_to_ref_bundle(ref)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 357, in _map_ref_to_ref_bundle
all_refs = list(ray.get(ref))
File "/usr/local/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/ray/_private/worker.py", line 2493, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(UnidentifiedImageError): ray::ReadImage() (pid=72712, ip=127.0.0.1)
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task
for b_out in fn(iter(blocks), ctx):
File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/planner/plan_read_op.py", line 67, in do_read
yield from read_task()
File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/datasource.py", line 214, in __call__
for block in result:
File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 478, in read_files
for data in read_stream(f, read_path, **reader_args):
File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 242, in _read_stream
yield self._read_file(f, path, **reader_args)
File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/image_datasource.py", line 85, in _read_file
image = Image.open(io.BytesIO(data))
File "/usr/local/lib/python3.7/site-packages/PIL/Image.py", line 3298, in open
raise UnidentifiedImageError(msg)
PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x19cbe7e90>
(ReadImage pid=72705) Task failed with retryable exception: TaskID(82d7af811dbf3277ffffffffffffffffffffffff01000000).
(ReadImage pid=72705) Traceback (most recent call last):
(ReadImage pid=72705) File "python/ray/_raylet.pyx", line 1191, in ray._raylet.execute_dynamic_generator_and_store_task_outputs
(ReadImage pid=72705) File "python/ray/_raylet.pyx", line 3667, in ray._raylet.CoreWorker.store_task_outputs
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task
(ReadImage pid=72705) for b_out in fn(iter(blocks), ctx):
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/_internal/planner/plan_read_op.py", line 67, in do_read
(ReadImage pid=72705) yield from read_task()
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/datasource.py", line 214, in __call__
(ReadImage pid=72705) for block in result:
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 478, in read_files
(ReadImage pid=72705) for data in read_stream(f, read_path, **reader_args):
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/file_based_datasource.py", line 242, in _read_stream
(ReadImage pid=72705) yield self._read_file(f, path, **reader_args)
(ReadImage pid=72705) File "/usr/local/lib/python3.7/site-packages/ray/data/datasource/image_datasource.py", line 98, in _read_file
Process finished with exit code 1