Hello,
I’m running a Tuner job with Ray 2.5.1 on Ubuntu, and I have been getting the following error fairly regularly (roughly 1 in 4 trials). I am saving a checkpoint every 10 iterations, plus a final checkpoint. Since there are JSON files in the checkpoint directories, and I am not doing anything else with JSON, I suspect this is a checkpointing problem. Any advice on how to investigate further or how to work around it? For the time being, I’m reducing the checkpoint frequency in the hope of lowering the odds of hitting it. Thank you.
*** SIGSEGV received at time=1697670253 on cpu 6 ***
PC: @ 0x50d247 (unknown) list_iter
@ 0x7fdaf7e3a420 (unknown) (unknown)
[2023-10-18 19:04:13,049 E 7073 7073] logging.cc:361: *** SIGSEGV received at time=1697670253 on cpu 6 ***
[2023-10-18 19:04:13,049 E 7073 7073] logging.cc:361: PC: @ 0x50d247 (unknown) list_iter
[2023-10-18 19:04:13,049 E 7073 7073] logging.cc:361: @ 0x7fdaf7e3a420 (unknown) (unknown)
Fatal Python error: Segmentation fault
Stack (most recent call first):
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/collections/__init__.py", line 981 in __getitem__
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 733 in dump
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 88 in dumps
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/utils/serialization.py", line 28 in _to_cloudpickle
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/utils/serialization.py", line 23 in default
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/json/encoder.py", line 438 in _iterencode
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/json/encoder.py", line 405 in _iterencode_dict
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/json/encoder.py", line 405 in _iterencode_dict
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/json/encoder.py", line 431 in _iterencode
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/json/__init__.py", line 179 in dump
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py", line 376 in save_to_dir
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/execution/experiment_state.py", line 232 in checkpoint
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py", line 491 in checkpoint
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/execution/tune_controller.py", line 269 in step
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/tune.py", line 1070 in run
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py", line 712 in _fit_internal
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py", line 588 in fit
File "/home/starkj/miniconda3/envs/cda0/lib/python3.10/site-packages/ray/tune/tuner.py", line 347 in fit
File "/home/starkj/projects/cda1/staging/tune.py", line 182 in main
File "/home/starkj/projects/cda1/staging/tune.py", line 195 in <module>
Extension modules: msgpack._cmsgpack, setproctitle, google._upb._message, psutil._psutil_linux, psutil._psutil_posix, yaml._yaml, grpc._cython.cygrpc, ray._raylet, mkl._mklinit, mkl._py_mkl_service, numpy.core._multiarray_umath, numpy.core._multiarray_tests, numpy.linalg._umath_linalg, numpy.fft._pocketfft_internal, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, pyarrow.lib, pyarrow._hdfsio, pyarrow._fs, pyarrow._hdfs, pyarrow._gcsfs, pyarrow._s3fs, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pandas._libs.hashing, pandas._libs.tslib, pandas._libs.ops, pyarrow._compute, pandas._libs.arrays, pandas._libs.sparse, pandas._libs.reduction, pandas._libs.indexing, pandas._libs.index, pandas._libs.internals, pandas._libs.join, pandas._libs.writers, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.testing, pandas._libs.parsers, pandas._libs.json, _cffi_backend, tensorflow.python.framework.fast_tensor_util, h5py._errors, h5py.defs, h5py._objects, h5py.h5, h5py.h5r, h5py.utils, h5py.h5s, h5py.h5ac, h5py.h5p, h5py.h5t, h5py._conv, h5py.h5z, h5py._proxy, h5py.h5a, h5py.h5d, h5py.h5ds, h5py.h5g, h5py.h5i, h5py.h5f, h5py.h5fd, h5py.h5pl, h5py.h5o, h5py.h5l, 
h5py._selector, scipy._lib._ccallback_c, scipy.sparse._sparsetools, _csparsetools, scipy.sparse._csparsetools, scipy.sparse.linalg._isolve._iterative, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_sqrtm_triu, scipy.linalg.cython_blas, scipy.linalg._matfuncs_expm, scipy.linalg._decomp_update, scipy.linalg._flinalg, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, PIL._imaging, scipy.ndimage._nd_image, scipy.special._ufuncs_cxx, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.special._ellip_harm_2, _ni_label, scipy.ndimage._ni_label, torch._C, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, gmpy2.gmpy2, skimage._shared.geometry, scipy.spatial._ckdtree, scipy._lib.messagestream, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._distance_wrap, scipy.spatial._hausdorff, scipy.spatial.transform._rotation, skimage.draw._draw, skimage.transform._hough_transform, scipy.interpolate._fitpack, scipy.interpolate.dfitpack, scipy.optimize._minpack2, scipy.optimize._group_columns, scipy.optimize._trlib._trlib, numpy.linalg.lapack_lite, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._cobyla, scipy.optimize._slsqp, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy.optimize.__nnls, scipy.optimize._highs.cython.src._highs_wrapper, scipy.optimize._highs._highs_wrapper, scipy.optimize._highs.cython.src._highs_constants, scipy.optimize._highs._highs_constants, scipy.linalg._interpolative, scipy.optimize._bglu_dense, 
scipy.optimize._lsap, scipy.optimize._direct, scipy.interpolate._bspl, scipy.interpolate._ppoly, scipy.interpolate.interpnd, scipy.interpolate._rbfinterp_pythran, scipy.interpolate._rgi_cython, scipy._lib._uarray._uarray, skimage.transform._warps_cy, skimage.measure._find_contours_cy, skimage.measure._marching_cubes_lewiner_cy, skimage.measure._moments_cy, scipy.signal._sigtools, scipy.signal._max_len_seq_inner, scipy.signal._upfirdn_apply, scipy.signal._spline, scipy.integrate._odepack, scipy.integrate._quadpack, scipy.integrate._vode, scipy.integrate._dop, scipy.integrate._lsoda, scipy.signal._sosfilt, scipy.signal._spectral, scipy.special.cython_special, scipy.stats._stats, scipy.stats.beta_ufunc, scipy.stats._boost.beta_ufunc, scipy.stats.binom_ufunc, scipy.stats._boost.binom_ufunc, scipy.stats.nbinom_ufunc, scipy.stats._boost.nbinom_ufunc, scipy.stats.hypergeom_ufunc, scipy.stats._boost.hypergeom_ufunc, scipy.stats.ncf_ufunc, scipy.stats._boost.ncf_ufunc, scipy.stats.ncx2_ufunc, scipy.stats._boost.ncx2_ufunc, scipy.stats.nct_ufunc, scipy.stats._boost.nct_ufunc, scipy.stats.skewnorm_ufunc, scipy.stats._boost.skewnorm_ufunc, scipy.stats.invgauss_ufunc, scipy.stats._boost.invgauss_ufunc, scipy.stats._biasedurn, scipy.stats._levy_stable.levyst, scipy.stats._stats_pythran, scipy.stats._statlib, scipy.stats._sobol, scipy.stats._qmc_cy, scipy.stats._mvn, scipy.stats._rcont.rcont, scipy.signal._peak_finding_utils, skimage.measure._pnpoly, skimage.measure._ccomp, skimage.transform._radon_transform, lz4._version, lz4.frame._frame, pyarrow._json (total: 228)
./train.sh: line 10: 7073 Segmentation fault (core dumped) python -u tune.py $1 > >(tee ~/tmp/log) 2> >(tee -a ~/tmp/log) 1>&2