Skip to content

v2-8 zarr issue #1905

@neverix

Description

@neverix

Code to reproduce: https://colab.research.google.com/drive/1p7lZZT_baqe4rtew8exc3F-ofxsAKe-R?usp=sharing

(writer::cache/validation pid=18821) 2025-08-08 01:22:54,469 - INFO - First group 0 finished. Copying other groups into permanent cache.
(_TreeStoreCacheBuilder pid=18612) 
cache/validation: copying: 100%|██████████| 1/1 [00:39<00:00, 39.77s/shard]
(writer::cache/validation pid=18821) 2025-08-08 01:22:56,269 - INFO - Cleaning up temporary cache at cache/validation/___temp
Traceback (most recent call last):
  File "/content/levanter/src/levanter/store/jagged_array.py", line 565, in _ts_open_sync
    ).result()
      ^^^^^^^^
ValueError: NOT_FOUND: Error opening "zarr3" driver: Metadata at local file "/content/levanter/cache/validation/input_ids/offsets/zarr.json" does not exist [source locations='tensorstore/driver/kvs_backed_chunk_driver.cc:1322\ntensorstore/driver/driver.cc:121'] [tensorstore_spec='{\"context\":{\"cache_pool\":{},\"data_copy_concurrency\":{},\"file_io_concurrency\":{},\"file_io_locking\":{},\"file_io_memmap\":false,\"file_io_sync\":true},\"driver\":\"zarr3\",\"dtype\":\"int64\",\"kvstore\":{\"driver\":\"file\",\"path\":\"/content/levanter/cache/validation/input_ids/offsets/\"},\"metadata\":{\"chunk_grid\":{\"configuration\":{\"chunk_shape\":[134217728]},\"name\":\"regular\"},\"codecs\":[{\"configuration\":{\"chunk_shape\":[262144],\"codecs\":[{\"configuration\":{\"clevel\":5},\"name\":\"blosc\"}]},\"name\":\"sharding_indexed\"}],\"node_type\":\"array\"},\"schema\":{\"domain\":{\"exclusive_max\":[18014398509481984],\"inclusive_min\":[0]}},\"transform\":{\"input_exclusive_max\":[[18014398509481984]],\"input_inclusive_min\":[0]}}']

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/levanter/src/levanter/main/train_lm.py", line 278, in <module>
    levanter.config.main(main)()
  File "/content/levanter/src/levanter/config.py", line 100, in wrapper_inner
    response = fn(cfg, *args, **kwargs)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/main/train_lm.py", line 183, in main
    cb = levanter.eval.cb_tagged_lm_evaluate(
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 198, in cb_tagged_lm_evaluate
    evaluator = TaggedEvaluator(
                ^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 298, in __init__
    self.loader = DataLoader(
                  ^^^^^^^^^^^
  File "/content/levanter/src/levanter/data/loader.py", line 127, in __init__
    current_len = blocking_wait(self.data_store.current_len())
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/utils/thread_utils.py", line 24, in blocking_wait
    return asyncio.run(coro)
           ^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 153, in current_len
    return await self.async_len()
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 107, in async_len
    return int((await self._get_offsets())[-1])
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 88, in _get_offsets
    lengths = await asyncio.gather(*[dataset.async_len() for dataset, _ in self.datasets])
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/data/text.py", line 179, in async_len
    return await self.dataset.async_len()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/data/text.py", line 108, in async_len
    await self.doc_cache.finished()
  File "/content/levanter/src/levanter/store/cache.py", line 409, in finished
    self._attempt_to_load_store(cache_metadata=False)
  File "/content/levanter/src/levanter/store/cache.py", line 417, in _attempt_to_load_store
    store = TreeStore.open(self._exemplar, self.cache_dir, mode="r", cache_metadata=cache_metadata)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/tree_store.py", line 61, in open
    tree = _construct_builder_tree(exemplar, path, mode, cache_metadata)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/tree_store.py", line 192, in _construct_builder_tree
    return jtu.tree_map_with_path(open_builder, exemplar, is_leaf=heuristic_is_leaf)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in tree_map_with_path
    return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in <genexpr>
    return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
                             ^^^^^^
  File "/content/levanter/src/levanter/store/tree_store.py", line 184, in open_builder
    return JaggedArrayStore.open(
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/jagged_array.py", line 133, in open
    offsets = _ts_open_sync(offset_path, jnp.int64, [1], mode=mode, cache_settings=cache_settings)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/jagged_array.py", line 568, in _ts_open_sync
    raise FileNotFoundError(f"File not found: {path}") from e
FileNotFoundError: File not found: cache/validation/input_ids/offsets
Traceback (most recent call last):
  File "/content/levanter/src/levanter/store/jagged_array.py", line 565, in _ts_open_sync
    ).result()
      ^^^^^^^^
ValueError: NOT_FOUND: Error opening "zarr3" driver: Metadata at local file "/content/levanter/cache/validation/input_ids/offsets/zarr.json" does not exist [source locations='tensorstore/driver/kvs_backed_chunk_driver.cc:1322\ntensorstore/driver/driver.cc:121'] [tensorstore_spec='{\"context\":{\"cache_pool\":{},\"data_copy_concurrency\":{},\"file_io_concurrency\":{},\"file_io_locking\":{},\"file_io_memmap\":false,\"file_io_sync\":true},\"driver\":\"zarr3\",\"dtype\":\"int64\",\"kvstore\":{\"driver\":\"file\",\"path\":\"/content/levanter/cache/validation/input_ids/offsets/\"},\"metadata\":{\"chunk_grid\":{\"configuration\":{\"chunk_shape\":[134217728]},\"name\":\"regular\"},\"codecs\":[{\"configuration\":{\"chunk_shape\":[262144],\"codecs\":[{\"configuration\":{\"clevel\":5},\"name\":\"blosc\"}]},\"name\":\"sharding_indexed\"}],\"node_type\":\"array\"},\"schema\":{\"domain\":{\"exclusive_max\":[18014398509481984],\"inclusive_min\":[0]}},\"transform\":{\"input_exclusive_max\":[[18014398509481984]],\"input_inclusive_min\":[0]}}']

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/levanter/src/levanter/main/train_lm.py", line 278, in <module>
    levanter.config.main(main)()
  File "/content/levanter/src/levanter/config.py", line 100, in wrapper_inner
    response = fn(cfg, *args, **kwargs)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/main/train_lm.py", line 183, in main
    cb = levanter.eval.cb_tagged_lm_evaluate(
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 198, in cb_tagged_lm_evaluate
    evaluator = TaggedEvaluator(
                ^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 298, in __init__
    self.loader = DataLoader(
                  ^^^^^^^^^^^
  File "/content/levanter/src/levanter/data/loader.py", line 127, in __init__
    current_len = blocking_wait(self.data_store.current_len())
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/utils/thread_utils.py", line 24, in blocking_wait
    return asyncio.run(coro)
           ^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 153, in current_len
    return await self.async_len()
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 107, in async_len
    return int((await self._get_offsets())[-1])
                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/eval.py", line 88, in _get_offsets
    lengths = await asyncio.gather(*[dataset.async_len() for dataset, _ in self.datasets])
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/data/text.py", line 179, in async_len
    return await self.dataset.async_len()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/data/text.py", line 108, in async_len
    await self.doc_cache.finished()
  File "/content/levanter/src/levanter/store/cache.py", line 409, in finished
    self._attempt_to_load_store(cache_metadata=False)
  File "/content/levanter/src/levanter/store/cache.py", line 417, in _attempt_to_load_store
    store = TreeStore.open(self._exemplar, self.cache_dir, mode="r", cache_metadata=cache_metadata)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/tree_store.py", line 61, in open
    tree = _construct_builder_tree(exemplar, path, mode, cache_metadata)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/tree_store.py", line 192, in _construct_builder_tree
    return jtu.tree_map_with_path(open_builder, exemplar, is_leaf=heuristic_is_leaf)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in tree_map_with_path
    return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in <genexpr>
    return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
                             ^^^^^^
  File "/content/levanter/src/levanter/store/tree_store.py", line 184, in open_builder
    return JaggedArrayStore.open(
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/jagged_array.py", line 133, in open
    offsets = _ts_open_sync(offset_path, jnp.int64, [1], mode=mode, cache_settings=cache_settings)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/levanter/src/levanter/store/jagged_array.py", line 568, in _ts_open_sync
    raise FileNotFoundError(f"File not found: {path}") from e
FileNotFoundError: File not found: cache/validation/input_ids/offsets
2025-08-08T01:22:57 - 0 - levanter.distributed - distributed.py:305 - INFO :: Shutting down ray...

Metadata

Assignees

No one assigned

    Labels

    levanter: Issues related to Levanter library

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions