-
Notifications
You must be signed in to change notification settings - Fork 56
Open
marin-community/levanter
#1274
Labels
levanter: Issues related to Levanter library
Description
Code to reproduce: https://colab.research.google.com/drive/1p7lZZT_baqe4rtew8exc3F-ofxsAKe-R?usp=sharing
(writer::cache/validation pid=18821) 2025-08-08 01:22:54,469 - INFO - First group 0 finished. Copying other groups into permanent cache.
(_TreeStoreCacheBuilder pid=18612)
cache/validation: copying: 100%|██████████| 1/1 [00:39<00:00, 39.77s/shard]
(writer::cache/validation pid=18821) 2025-08-08 01:22:56,269 - INFO - Cleaning up temporary cache at cache/validation/___temp
Traceback (most recent call last):
File "/content/levanter/src/levanter/store/jagged_array.py", line 565, in _ts_open_sync
).result()
^^^^^^^^
ValueError: NOT_FOUND: Error opening "zarr3" driver: Metadata at local file "/content/levanter/cache/validation/input_ids/offsets/zarr.json" does not exist [source locations='tensorstore/driver/kvs_backed_chunk_driver.cc:1322\ntensorstore/driver/driver.cc:121'] [tensorstore_spec='{\"context\":{\"cache_pool\":{},\"data_copy_concurrency\":{},\"file_io_concurrency\":{},\"file_io_locking\":{},\"file_io_memmap\":false,\"file_io_sync\":true},\"driver\":\"zarr3\",\"dtype\":\"int64\",\"kvstore\":{\"driver\":\"file\",\"path\":\"/content/levanter/cache/validation/input_ids/offsets/\"},\"metadata\":{\"chunk_grid\":{\"configuration\":{\"chunk_shape\":[134217728]},\"name\":\"regular\"},\"codecs\":[{\"configuration\":{\"chunk_shape\":[262144],\"codecs\":[{\"configuration\":{\"clevel\":5},\"name\":\"blosc\"}]},\"name\":\"sharding_indexed\"}],\"node_type\":\"array\"},\"schema\":{\"domain\":{\"exclusive_max\":[18014398509481984],\"inclusive_min\":[0]}},\"transform\":{\"input_exclusive_max\":[[18014398509481984]],\"input_inclusive_min\":[0]}}']
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/content/levanter/src/levanter/main/train_lm.py", line 278, in <module>
levanter.config.main(main)()
File "/content/levanter/src/levanter/config.py", line 100, in wrapper_inner
response = fn(cfg, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/main/train_lm.py", line 183, in main
cb = levanter.eval.cb_tagged_lm_evaluate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 198, in cb_tagged_lm_evaluate
evaluator = TaggedEvaluator(
^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 298, in __init__
self.loader = DataLoader(
^^^^^^^^^^^
File "/content/levanter/src/levanter/data/loader.py", line 127, in __init__
current_len = blocking_wait(self.data_store.current_len())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/utils/thread_utils.py", line 24, in blocking_wait
return asyncio.run(coro)
^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 153, in current_len
return await self.async_len()
^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 107, in async_len
return int((await self._get_offsets())[-1])
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 88, in _get_offsets
lengths = await asyncio.gather(*[dataset.async_len() for dataset, _ in self.datasets])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/data/text.py", line 179, in async_len
return await self.dataset.async_len()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/data/text.py", line 108, in async_len
await self.doc_cache.finished()
File "/content/levanter/src/levanter/store/cache.py", line 409, in finished
self._attempt_to_load_store(cache_metadata=False)
File "/content/levanter/src/levanter/store/cache.py", line 417, in _attempt_to_load_store
store = TreeStore.open(self._exemplar, self.cache_dir, mode="r", cache_metadata=cache_metadata)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/tree_store.py", line 61, in open
tree = _construct_builder_tree(exemplar, path, mode, cache_metadata)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/tree_store.py", line 192, in _construct_builder_tree
return jtu.tree_map_with_path(open_builder, exemplar, is_leaf=heuristic_is_leaf)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in tree_map_with_path
return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in <genexpr>
return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
^^^^^^
File "/content/levanter/src/levanter/store/tree_store.py", line 184, in open_builder
return JaggedArrayStore.open(
^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/jagged_array.py", line 133, in open
offsets = _ts_open_sync(offset_path, jnp.int64, [1], mode=mode, cache_settings=cache_settings)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/jagged_array.py", line 568, in _ts_open_sync
raise FileNotFoundError(f"File not found: {path}") from e
FileNotFoundError: File not found: cache/validation/input_ids/offsets
Traceback (most recent call last):
File "/content/levanter/src/levanter/store/jagged_array.py", line 565, in _ts_open_sync
).result()
^^^^^^^^
ValueError: NOT_FOUND: Error opening "zarr3" driver: Metadata at local file "/content/levanter/cache/validation/input_ids/offsets/zarr.json" does not exist [source locations='tensorstore/driver/kvs_backed_chunk_driver.cc:1322\ntensorstore/driver/driver.cc:121'] [tensorstore_spec='{\"context\":{\"cache_pool\":{},\"data_copy_concurrency\":{},\"file_io_concurrency\":{},\"file_io_locking\":{},\"file_io_memmap\":false,\"file_io_sync\":true},\"driver\":\"zarr3\",\"dtype\":\"int64\",\"kvstore\":{\"driver\":\"file\",\"path\":\"/content/levanter/cache/validation/input_ids/offsets/\"},\"metadata\":{\"chunk_grid\":{\"configuration\":{\"chunk_shape\":[134217728]},\"name\":\"regular\"},\"codecs\":[{\"configuration\":{\"chunk_shape\":[262144],\"codecs\":[{\"configuration\":{\"clevel\":5},\"name\":\"blosc\"}]},\"name\":\"sharding_indexed\"}],\"node_type\":\"array\"},\"schema\":{\"domain\":{\"exclusive_max\":[18014398509481984],\"inclusive_min\":[0]}},\"transform\":{\"input_exclusive_max\":[[18014398509481984]],\"input_inclusive_min\":[0]}}']
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/content/levanter/src/levanter/main/train_lm.py", line 278, in <module>
levanter.config.main(main)()
File "/content/levanter/src/levanter/config.py", line 100, in wrapper_inner
response = fn(cfg, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/main/train_lm.py", line 183, in main
cb = levanter.eval.cb_tagged_lm_evaluate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 198, in cb_tagged_lm_evaluate
evaluator = TaggedEvaluator(
^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 298, in __init__
self.loader = DataLoader(
^^^^^^^^^^^
File "/content/levanter/src/levanter/data/loader.py", line 127, in __init__
current_len = blocking_wait(self.data_store.current_len())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/utils/thread_utils.py", line 24, in blocking_wait
return asyncio.run(coro)
^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 153, in current_len
return await self.async_len()
^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 107, in async_len
return int((await self._get_offsets())[-1])
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/eval.py", line 88, in _get_offsets
lengths = await asyncio.gather(*[dataset.async_len() for dataset, _ in self.datasets])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/data/text.py", line 179, in async_len
return await self.dataset.async_len()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/data/text.py", line 108, in async_len
await self.doc_cache.finished()
File "/content/levanter/src/levanter/store/cache.py", line 409, in finished
self._attempt_to_load_store(cache_metadata=False)
File "/content/levanter/src/levanter/store/cache.py", line 417, in _attempt_to_load_store
store = TreeStore.open(self._exemplar, self.cache_dir, mode="r", cache_metadata=cache_metadata)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/tree_store.py", line 61, in open
tree = _construct_builder_tree(exemplar, path, mode, cache_metadata)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/tree_store.py", line 192, in _construct_builder_tree
return jtu.tree_map_with_path(open_builder, exemplar, is_leaf=heuristic_is_leaf)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in tree_map_with_path
return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/.venv/lib/python3.11/site-packages/jax/_src/tree_util.py", line 1166, in <genexpr>
return treedef.unflatten(f(*xs) for xs in zip(*all_keypath_leaves))
^^^^^^
File "/content/levanter/src/levanter/store/tree_store.py", line 184, in open_builder
return JaggedArrayStore.open(
^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/jagged_array.py", line 133, in open
offsets = _ts_open_sync(offset_path, jnp.int64, [1], mode=mode, cache_settings=cache_settings)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/content/levanter/src/levanter/store/jagged_array.py", line 568, in _ts_open_sync
raise FileNotFoundError(f"File not found: {path}") from e
FileNotFoundError: File not found: cache/validation/input_ids/offsets
2025-08-08T01:22:57 - 0 - levanter.distributed - distributed.py:305 - INFO :: Shutting down ray...
Metadata
Assignees
Labels
levanter: Issues related to Levanter library