 
 This achieves about a 90% "real token" rate, compared to roughly 10% without packing.
 """
+
 import asyncio
 from dataclasses import dataclass
 from typing import Iterable, Iterator, Literal, Optional, Sequence, TypeVar
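
A minimal sketch of how one might measure that utilization for a packed batch, assuming padded positions are marked with segment id -1 (as the pack() method below does); `seg` is an illustrative NumPy array, not part of this module:

    import numpy as np

    def real_token_fraction(segment_ids: np.ndarray) -> float:
        # Positions whose segment id is not -1 carry real (non-padding) tokens.
        return float(np.count_nonzero(segment_ids != -1) / segment_ids.size)

    # e.g. two documents of 900 and 3000 tokens packed into one 4096-token example:
    seg = np.concatenate([np.zeros(900), np.ones(3000), -np.ones(196)])
    print(real_token_fraction(seg))  # ~0.95
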
@@ -62,9 +63,17 @@ def __init__(self, Pos: hax.Axis, max_pack_size: int, pad_token: int):
         assert pad_token is not None, "pad_token must be set"
 
     def can_pack(self, ids: list[int]) -> bool:
-        return len(ids) + len(self._ids) <= self.Pos.size and self.num_segments < self.max_pack_size
+        return (
+            len(ids) + len(self._ids) <= self.Pos.size
+            and self.num_segments < self.max_pack_size
+        )
 
-    def add_example(self, ids: list[int], loss_mask: list[int] | np.ndarray, segment_id: int | None = None):
+    def add_example(
+        self,
+        ids: list[int],
+        loss_mask: list[int] | np.ndarray,
+        segment_id: int | None = None,
+    ):
         if len(ids) != len(loss_mask):
             raise ValueError("ids and loss_mask must have the same length")
 
@@ -90,7 +99,9 @@ def add_example(self, ids: list[int], loss_mask: list[int] | np.ndarray, segment
     def pack(self) -> LmExample:
         ids = self._ids + [self.pad_token] * (self.Pos.size - len(self._ids))
 
-        segment_ids = self._segment_ids + [-1] * (self.Pos.size - len(self._segment_ids))
+        segment_ids = self._segment_ids + [-1] * (
+            self.Pos.size - len(self._segment_ids)
+        )
 
         loss_mask = self._loss_mask + [0] * (self.Pos.size - len(self._loss_mask))
 
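
A short usage sketch of the greedy packer shown above, assuming the `SequencePacker` constructor from this diff (`Pos` axis, `max_pack_size`, `pad_token`); `documents`, an iterable of `(ids, loss_mask)` pairs whose documents each fit within `Pos.size`, is hypothetical:

    import haliax as hax

    Pos = hax.Axis("position", 4096)
    packer = SequencePacker(Pos, max_pack_size=64, pad_token=0)

    packed = []
    for ids, loss_mask in documents:
        if not packer.can_pack(ids):
            packed.append(packer.pack())  # flush the full packer as one LmExample
            packer = SequencePacker(Pos, max_pack_size=64, pad_token=0)
        packer.add_example(ids, loss_mask)
    packed.append(packer.pack())  # flush the final, partially filled packer
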
@@ -173,7 +184,9 @@ def per_segment_loss(
     This code is designed to run in a jit-compiled function, meaning we have to be careful of shapes
     """
 
-    assert packed_example.attn_mask.segment_ids is not None, "segment_ids must be set in the AttentionMask"
+    assert (
+        packed_example.attn_mask.segment_ids is not None
+    ), "segment_ids must be set in the AttentionMask"
 
     segment_ids = packed_example.attn_mask.segment_ids
     assert (
@@ -200,7 +213,9 @@ def per_segment_loss(
 def _unique_segment_ids(max_Segments, segment_ids):
     # Extract unique segment IDs with padding
     # TODO: add unique to haliax
-    unique_segment_ids = jnp.unique(segment_ids.array, size=max_Segments.size, fill_value=-1)
+    unique_segment_ids = jnp.unique(
+        segment_ids.array, size=max_Segments.size, fill_value=-1
+    )
     unique_segment_ids = hax.named(unique_segment_ids, max_Segments)
     return unique_segment_ids
 
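
The fixed-size `jnp.unique(..., size=..., fill_value=-1)` call is what makes the per-segment reductions jit-friendly: the output shape is static and the padded slots reuse the -1 sentinel. A standalone sketch of the same idea (plain JAX, not this module's API):

    import jax.numpy as jnp

    def per_segment_sum(values, segment_ids, max_segments: int):
        # Static-shape unique: padded entries are -1, matching the padding sentinel.
        uniq = jnp.unique(segment_ids, size=max_segments, fill_value=-1)
        # mask[k, t] is True where token t belongs to unique segment k (and k is real).
        mask = (segment_ids[None, :] == uniq[:, None]) & (uniq[:, None] != -1)
        return uniq, jnp.where(mask, values[None, :], 0.0).sum(axis=1)

    values = jnp.array([1.0, 2.0, 3.0, 4.0, 0.0])
    segs = jnp.array([0, 0, 1, 1, -1])  # -1 marks padding positions
    print(per_segment_sum(values, segs, max_segments=4))  # sums 3.0 and 7.0 for segments 0 and 1
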
@@ -219,7 +234,9 @@ def per_segment_correct(
     correct is a boolean array of the same shape as the losses array indicating whether the token was correct
     """
 
-    assert packed_example.attn_mask.segment_ids is not None, "segment_ids must be set in the AttentionMask"
+    assert (
+        packed_example.attn_mask.segment_ids is not None
+    ), "segment_ids must be set in the AttentionMask"
 
     segment_ids = packed_example.attn_mask.segment_ids
     assert (
@@ -250,6 +267,8 @@ def greedy_pack_prompt_completions(
     sequences: Iterable[PromptCompletion],
     pad_token: int,
     max_segments_per_example: int = 64,
+    pad_start: int = 0,
+    lengths: np.ndarray | None = None,
 ) -> list[LmExample]:
     """
     Greedy packing of prompt completions into LmExamples using [pack_documents][]
@@ -265,8 +284,12 @@ def make_loss_mask(id, prompt_length):
     ids = [sequence.ids for sequence in sequences]
 
     # Pack documents based on their lengths
+    pack_lengths = (
+        np.array([len(token_ids) for token_ids in ids]) if lengths is None else lengths
+    )
+    pack_lengths = pack_lengths + pad_start
     packs = pack_documents(
-        lengths=np.array([len(token_ids) for token_ids in ids]),
+        lengths=pack_lengths,
         max_length=Pos.size,
         max_segments_per_example=max_segments_per_example,
         slice_too_long_examples=True,
@@ -285,10 +308,21 @@ def make_loss_mask(id, prompt_length):
         concat_loss_mask = []
         segment_ids = []
 
-        for doc_id, seq, prompt_len in zip(docs_in_pack, pack_sequences, pack_prompt_lengths):
+        for doc_id, seq, prompt_len in zip(
+            docs_in_pack, pack_sequences, pack_prompt_lengths
+        ):
+            doc_length = len(seq.ids)
+            pad_end = pack_lengths[doc_id] - pad_start - doc_length
+
             concat_ids.extend(seq.ids)
+
+            concat_loss_mask.extend([0] * pad_start)
             concat_loss_mask.extend(make_loss_mask(seq.ids, prompt_len))
-            segment_ids.extend([doc_id] * len(seq.ids))
+            concat_loss_mask.extend([0] * pad_end)
+
+            segment_ids.extend([-1] * pad_start)
+            segment_ids.extend([doc_id] * pack_lengths[doc_id])
+            segment_ids.extend([-1] * pad_end)
 
         # Pad to max length
         pad_length = Pos.size - len(concat_ids)
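
Note that with this layout the three buffers can grow at different rates: `concat_ids` receives only the raw tokens, `concat_loss_mask` receives `pack_lengths[doc_id]` entries, and `segment_ids` receives `pack_lengths[doc_id]` plus the two pad runs, so the `pad_length` computed from `concat_ids` below may not line up with the other two. A self-contained sketch of one consistent per-document slot, under the assumption that each document should occupy exactly `pad_start + len(ids) + pad_end` positions, with padding excluded from the loss and marked with segment id -1 (the helper name is illustrative, not part of this commit):

    def build_document_slot(ids, loss_mask, doc_id, slot_len, pad_start, pad_token):
        # Lay out one document in a fixed-size slot: pad_start pads, the tokens, trailing pads.
        pad_end = slot_len - pad_start - len(ids)
        assert pad_end >= 0, "slot_len must cover pad_start + len(ids)"
        slot_ids = [pad_token] * pad_start + list(ids) + [pad_token] * pad_end
        slot_loss = [0] * pad_start + list(loss_mask) + [0] * pad_end
        slot_segs = [-1] * pad_start + [doc_id] * len(ids) + [-1] * pad_end
        return slot_ids, slot_loss, slot_segs

    print(build_document_slot([5, 6, 7], [0, 1, 1], doc_id=3, slot_len=6, pad_start=1, pad_token=0))
    # ([0, 5, 6, 7, 0, 0], [0, 0, 1, 1, 0, 0], [-1, 3, 3, 3, -1, -1])
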
@@ -300,7 +334,9 @@ def make_loss_mask(id, prompt_length):
         elif pad_length < 0:
             # too long, this should only happen if there's 1 document in the pack
             if len(pack_sequences) != 1:
-                raise ValueError("Too many tokens in a pack with more than one document")
+                raise ValueError(
+                    "Too many tokens in a pack with more than one document"
+                )
             concat_ids = concat_ids[-Pos.size :]
             concat_loss_mask = concat_loss_mask[-Pos.size :]
             segment_ids = segment_ids[-Pos.size :]
@@ -326,6 +362,7 @@ def _segment_ids_from_lengths(doc_ids: list[int], lengths: list[int]) -> list[in
 def pack_documents(
     lengths: PyTree[np.ndarray],
     max_length: PyTree[int],
+    pad_start: PyTree[int],
     max_segments_per_example: int | None = None,
     slice_too_long_examples: bool = False,
 ) -> list[range]:
@@ -347,17 +384,24 @@ def pack_documents(
     if max_segments_per_example is not None and (
         not isinstance(max_segments_per_example, int) or max_segments_per_example <= 0
     ):
-        raise ValueError(f"max_segments_per_example must be a positive integer, got {max_segments_per_example}")
+        raise ValueError(
+            f"max_segments_per_example must be a positive integer, got {max_segments_per_example}"
+        )
+
+    lengths_leaves = jax.tree.leaves(lengths)
+    leaf_names = jax.tree.leaves(leaf_key_paths(lengths))
 
     # Broadcast max_length to match the structure of lengths
     max_length_tree = tree_broadcast_to(max_length, lengths)
-
-    lengths_leaves = jax.tree.leaves(lengths)
     max_length_leaves = jax.tree.leaves(max_length_tree)
-    leaf_names = jax.tree.leaves(leaf_key_paths(lengths))
+
+    pad_start_tree = tree_broadcast_to(pad_start, lengths)
+    pad_start_leaves = jax.tree.leaves(pad_start_tree)
 
     if len(lengths_leaves) != len(max_length_leaves):
-        raise ValueError("Lengths and max_length PyTrees must have the same number of leaves.")
+        raise ValueError(
+            "Lengths and max_length PyTrees must have the same number of leaves."
+        )
 
     # Check that all leaves have the same number of documents.
     n_docs = None
@@ -370,12 +414,14 @@ def pack_documents(
     if n_docs is None:
         raise ValueError("Could not determine the number of documents from lengths.")
 
-    # Validate document lengths
+    # Validate document lengths (including pad_start)
     for lens, allowed, leaf_name in zip(lengths_leaves, max_length_leaves, leaf_names):
         for i in range(n_docs):
-            if lens[i] > allowed and not slice_too_long_examples:
+            effective_length = lens[i] + pad_start
+            if effective_length > allowed and not slice_too_long_examples:
                 raise ValueError(
-                    f"Document {i} in leaf '{leaf_name}' has length {lens[i]} which exceeds "
+                    f"Document {i} in leaf '{leaf_name}' has effective length {effective_length} "
+                    f"(document length {lens[i]} + pad_start {pad_start}) which exceeds "
                     f"maximum allowed length {allowed}. Consider setting slice_too_long_examples=True "
                     "or increasing max_length."
                 )
@@ -388,19 +434,27 @@ def pack_documents(
         # Accumulate documents while, for each leaf, the token span remains within the allowed max.
         while i < n_docs:
             # Check optional segment constraint: if adding one more document would exceed max_segments_per_example.
-            if max_segments_per_example is not None and (total_segments + 1) > max_segments_per_example:
+            if (
+                max_segments_per_example is not None
+                and (total_segments + 1) > max_segments_per_example
+            ):
                 break
             # For each leaf, check if adding document i would keep the token count within allowed capacity.
             valid = True
-            for lens, allowed, leaf_name in zip(lengths_leaves, max_length_leaves, leaf_names, strict=True):
-                # Compute token count from document start to document i+1.
-                token_sum = sum(lens[start : i + 1])
+            for lens, allowed, leaf_name in zip(
+                lengths_leaves, max_length_leaves, leaf_names, strict=True
+            ):
+                # Compute token count from document start to document i+1, including pad_start for each doc.
+                num_docs_in_pack = i - start + 1
+                token_sum = sum(lens[start : i + 1]) + num_docs_in_pack * pad_start
                 if token_sum > allowed:
                     valid = False
                     if not slice_too_long_examples and i == start:
                         # If this is the first document in a new pack and it's too long, raise an error
+                        effective_length = lens[i] + pad_start
                         raise ValueError(
-                            f"Document {i} in leaf '{leaf_name}' has length {lens[i]} which exceeds "
+                            f"Document {i} in leaf '{leaf_name}' has effective length {effective_length} "
+                            f"(document length {lens[i]} + pad_start {pad_start}) which exceeds "
                             f"maximum allowed length {allowed}. Consider setting slice_too_long_examples=True "
                             "or increasing max_length."
                         )
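
For intuition, a compact standalone version of the greedy decision this loop makes: keep appending documents while the running token budget (each document charged its length plus `pad_start`) and the segment budget hold, otherwise start a new pack. This sketch assumes a flat list of lengths for a single leaf, unlike the PyTree-aware code above:

    def greedy_pack_ranges(lengths, max_length, pad_start=0, max_segments=None):
        # Returns ranges of document indices, packing greedily in order.
        packs, start, n = [], 0, len(lengths)
        while start < n:
            i, total = start, 0
            while i < n:
                if max_segments is not None and (i - start + 1) > max_segments:
                    break
                if total + lengths[i] + pad_start > max_length and i > start:
                    break
                total += lengths[i] + pad_start
                i += 1
            # A document longer than max_length still gets its own pack
            # (cf. slice_too_long_examples in pack_documents).
            packs.append(range(start, max(i, start + 1)))
            start = max(i, start + 1)
        return packs

    print(greedy_pack_ranges([1000, 3000, 1500, 4096, 200], max_length=4096))
    # [range(0, 2), range(2, 3), range(3, 4), range(4, 5)]
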
@@ -445,6 +499,8 @@ def __init__(
         max_segments_per_example: int | None = None,
         pad_with_zeros: bool = True,
         slice_strategy: Literal["left", "right", "raise"] = "raise",
+        lengths: np.ndarray | None = None,
+        prefixes: PyTree[np.ndarray] | None = None,
     ):
         """
         Args:
@@ -457,29 +513,42 @@ def __init__(
         super().__init__()
 
         if slice_strategy not in ["left", "right", "raise"]:
-            raise ValueError(f"slice_strategy must be one of 'left', 'right', or 'raise', got {slice_strategy}")
+            raise ValueError(
+                f"slice_strategy must be one of 'left', 'right', or 'raise', got {slice_strategy}"
+            )
 
         self.dataset = dataset
         self.max_length = max_length
         self.max_segments_per_example = max_segments_per_example
         self.pad_with_zeros = pad_with_zeros
         self.slice_strategy = slice_strategy
 
-        _offsets = jax.tree.map(lambda store: store.offsets[0 : store.num_rows + 1].read(), self.dataset)
+        _offsets = jax.tree.map(
+            lambda store: store.offsets[0 : store.num_rows + 1].read(), self.dataset
+        )
         self._offsets = jax.tree.map(lambda fut: fut.result(), _offsets)
 
-        def diff_offsets(offsets: np.ndarray):
-            # fine to mutate since we have a copy
-            # the array store has the number of rows in the 0th offset
-            offsets[0] = 0
-            return offsets[1:] - offsets[:-1]
 
-        # Convert offsets to lengths
-        self._lengths = jax.tree.map(diff_offsets, self._offsets)
+        if lengths is not None:
+            self._lengths = lengths
+        else:
+            def diff_offsets(offsets: np.ndarray):
+                # fine to mutate since we have a copy
+                # the array store has the number of rows in the 0th offset
+                offsets[0] = 0
+                return offsets[1:] - offsets[:-1]
+
+            # Convert offsets to lengths
+            self._lengths = jax.tree.map(diff_offsets, self._offsets)
+
 
         # Build pack indices
         self._pack_indices: list[range] = pack_documents(
-            self._lengths, max_length, max_segments_per_example, slice_strategy != "raise"
+            self._lengths,
+            max_length,
+            pad_start,
+            max_segments_per_example,
+            slice_strategy != "raise",
         )
 
     def is_finite(self) -> bool:
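
Two small notes on this constructor: the offsets-to-lengths conversion relies on the JaggedArrayStore convention that slot 0 of the offsets array holds the row count rather than a real offset, which is why it is zeroed before differencing; and `pad_start` is forwarded to `pack_documents` here even though it is not among the parameters added in this hunk, so presumably it is derived elsewhere (perhaps from `prefixes`). A tiny illustration of the conversion:

    import numpy as np

    def diff_offsets(offsets: np.ndarray) -> np.ndarray:
        offsets = offsets.copy()
        offsets[0] = 0  # slot 0 holds the row count in the store, not a real offset
        return offsets[1:] - offsets[:-1]

    # 3 documents whose tokens end at flat positions 4, 9 and 15
    print(diff_offsets(np.array([3, 4, 9, 15])))  # [4 5 6]
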
@@ -494,7 +563,9 @@ async def final_length_is_known(self) -> bool:
     async def current_len(self) -> Optional[int]:
         return len(self._pack_indices)
 
-    async def get_batch(self, indices: Sequence[int]) -> Sequence[tuple[PyTree[np.ndarray], PyTree[np.ndarray]]]:
+    async def get_batch(
+        self, indices: Sequence[int]
+    ) -> Sequence[tuple[PyTree[np.ndarray], PyTree[np.ndarray]]]:
         """
         For each requested packed example (by index into self._pack_indices), reconstruct the
         token data on the fly from the underlying dataset. In our packing scheme the pack holds, for each leaf,
@@ -508,7 +579,9 @@ async def get_batch(self, indices: Sequence[int]) -> Sequence[tuple[PyTree[np.nd
 
         pack_doc_ranges = [self._pack_indices[i] for i in indices]
 
-        async def get_data_for_leaf(store, offsets, allowed: int) -> tuple[list[np.ndarray], list[np.ndarray]]:
+        async def get_data_for_leaf(
+            store, offsets, allowed: int
+        ) -> tuple[list[np.ndarray], list[np.ndarray]]:
             out_data = []
             out_segment_ids = []
             # Using ts.Batch to group reads.
@@ -520,7 +593,9 @@ async def get_data_for_leaf(store, offsets, allowed: int) -> tuple[list[np.ndarr
                     token_count = token_end - token_start
                     if token_count > allowed:
                         if self.slice_strategy != "raise":
-                            assert len(dr) == 1, "We shouldn't have packed two examples together if one is too long."
+                            assert (
+                                len(dr) == 1
+                            ), "We shouldn't have packed two examples together if one is too long."
                             if self.slice_strategy == "right":
                                 # slice from the right
                                 token_start = token_end - allowed
@@ -533,12 +608,20 @@ async def get_data_for_leaf(store, offsets, allowed: int) -> tuple[list[np.ndarr
                                 f"{list(dr)}. Consider using a different slice_strategy or increasing max_length."
                             )
                     # Read the slice from the underlying data.
+                    # TODO: need to pad at start with token_idx and at end up to length here
+                    # the size of each example will differ, but that way the output size of denoi
+                    # or... maybe we identify the starts of segments using seg_ids (and pad this tensor?)
+                    # then we roll & call on each of the rolled parts
+                    # so this doesn't need to start with token_idx
+                    # and we don't need pad_start
                     out_data.append(store.data[token_start:token_end].read())
 
                     # Create segment IDs for this pack
                     segment_ids = []
                     for doc_idx in range(len(dr)):
-                        doc_start = offsets[dr.start + doc_idx] if dr.start + doc_idx > 0 else 0
+                        doc_start = (
+                            offsets[dr.start + doc_idx] if dr.start + doc_idx > 0 else 0
+                        )
                         doc_end = offsets[dr.start + doc_idx + 1]
                         doc_length = doc_end - doc_start
                         # Use the global document index as the segment ID
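
A standalone sketch of how these per-pack segment ids come together from the raw offsets, mirroring the loop above; it assumes the store's offset layout in which index 0 holds the row count, hence the `else 0` guard for the very first document:

    import numpy as np

    def pack_segment_ids(offsets: np.ndarray, dr: range) -> np.ndarray:
        # Label every token in the packed doc range with its global document index.
        parts = []
        for doc_idx in range(len(dr)):
            doc_start = offsets[dr.start + doc_idx] if dr.start + doc_idx > 0 else 0
            doc_end = offsets[dr.start + doc_idx + 1]
            parts.append(np.full(doc_end - doc_start, dr.start + doc_idx))
        return np.concatenate(parts)

    # offsets for 3 documents of lengths 4, 5, 6 (slot 0 holds the row count, 3)
    print(pack_segment_ids(np.array([3, 4, 9, 15]), range(0, 2)))
    # [0 0 0 0 1 1 1 1 1]
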
@@ -555,7 +638,10 @@ async def get_data_for_leaf(store, offsets, allowed: int) -> tuple[list[np.ndarr
 
             if self.pad_with_zeros:
                 out_data = [np.pad(x, (0, allowed - x.shape[0])) for x in out_data]
-                out_segment_ids = [np.pad(x, (0, allowed - x.shape[0]), constant_values=-1) for x in out_segment_ids]
+                out_segment_ids = [
+                    np.pad(x, (0, allowed - x.shape[0]), constant_values=-1)
+                    for x in out_segment_ids
+                ]
 
             return out_data, out_segment_ids
 
@@ -568,7 +654,9 @@ async def get_data_for_leaf(store, offsets, allowed: int) -> tuple[list[np.ndarr
         # Use tree.map to combine the leaves from: dataset, max_length and, for each pack, its doc_range.
         # Note: jax.tree.map will map over each pack in parallel across the leaves.
         max_length_tree = tree_broadcast_to(self.max_length, self._offsets)
-        leaf_batch_futures = jax.tree.map(get_data_for_leaf, self.dataset, self._offsets, max_length_tree)
+        leaf_batch_futures = jax.tree.map(
+            get_data_for_leaf, self.dataset, self._offsets, max_length_tree
+        )
 
         # Flatten the resulting PyTree: each leaf is now an Awaitable returning a tuple of lists of np.ndarray, one per requested pack.
         leaves, treedef = jax.tree.flatten(leaf_batch_futures)
@@ -582,7 +670,9 @@ async def get_data_for_leaf(store, offsets, allowed: int) -> tuple[list[np.ndarr
         results = []
         for i in range(len(indices)):
             data = jax.tree.unflatten(treedef, [leaf[0][i] for leaf in resolved_leaves])
-            segment_ids = jax.tree.unflatten(treedef, [leaf[1][i] for leaf in resolved_leaves])
+            segment_ids = jax.tree.unflatten(
+                treedef, [leaf[1][i] for leaf in resolved_leaves]
+            )
             results.append((data, segment_ids))
         return results
 
@@ -598,7 +688,9 @@ async def get_data_for_leaf(store, offsets, allowed: int) -> tuple[list[np.ndarr
     store = JaggedArrayStore.open(path, mode="r", dtype=np.uint32, cache_metadata=True)
 
     time_in = time.time()
-    packed = GreedyPrepackedDataset(store, max_length=4096, pad_with_zeros=True, slice_strategy="right")
+    packed = GreedyPrepackedDataset(
+        store, max_length=4096, pad_with_zeros=True, slice_strategy="right"
+    )
     time_out = time.time()
     print(f"Took {time_out - time_in:.2f}s to build pack")
 