
Commit a82562e

Add a level 3 sleep/wake_up that offloads tensors to disk
Co-authored-by: aavarghese <[email protected]>
Co-authored-by: manoelmarques <[email protected]>
Signed-off-by: Manoel Marques <[email protected]>
1 parent e246ad6 commit a82562e

File tree: 5 files changed, +159 −2 lines changed

tests/basic_correctness/test_cumem.py

Lines changed: 24 additions & 0 deletions
@@ -171,6 +171,30 @@ def test_end_to_end(model: str):
     # cmp output
     assert output[0].outputs[0].text == output3[0].outputs[0].text
 
+    # test sleep level 3 here.
+    llm.sleep(level=3)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
+
+    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+    # is captured but cannot be released from PyTorch due to a known bug,
+    # therefore high memory usage after `llm.sleep` is called is expected.
+    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+    # in V1.
+    if use_v1:
+        assert used_bytes < 7 * GiB_bytes
+    else:
+        assert used_bytes < 2 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
+
 
 @create_new_process_for_each_test()
 def test_deep_sleep():

tests/entrypoints/openai/test_sleep.py

Lines changed: 12 additions & 0 deletions
@@ -59,3 +59,15 @@ def test_sleep_mode():
     response = requests.get(remote_server.url_for("is_sleeping"))
     assert response.status_code == 200
     assert response.json().get("is_sleeping") is False
+
+    response = requests.post(remote_server.url_for("/sleep"), data={"level": "3"})
+    assert response.status_code == 200
+    response = requests.get(remote_server.url_for("/is_sleeping"))
+    assert response.status_code == 200
+    assert response.json().get("is_sleeping") is True
+
+    response = requests.post(remote_server.url_for("/wake_up"))
+    assert response.status_code == 200
+    response = requests.get(remote_server.url_for("/is_sleeping"))
+    assert response.status_code == 200
+    assert response.json().get("is_sleeping") is False
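Outside the test harness, the same endpoints can be exercised against a running server. The sketch below is a hedged adaptation of the test: the base URL is a placeholder, and (an assumption not stated in this commit) the sleep endpoints are development endpoints, so the server typically needs sleep mode enabled, e.g. started along the lines of `vllm serve <model> --enable-sleep-mode`, possibly with `VLLM_SERVER_DEV_MODE=1` depending on the vLLM version.

import requests

base = "http://localhost:8000"  # assumed server address

# Put the engine into level 3 sleep (weights spilled to disk).
assert requests.post(f"{base}/sleep", data={"level": "3"}).status_code == 200
assert requests.get(f"{base}/is_sleeping").json().get("is_sleeping") is True

# Wake it up again; weights are read back from the cache file.
assert requests.post(f"{base}/wake_up").status_code == 200
assert requests.get(f"{base}/is_sleeping").json().get("is_sleeping") is False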

vllm/device_allocator/cumem.py

Lines changed: 115 additions & 1 deletion
@@ -8,20 +8,68 @@
 # both of them failed because of cuda context mismatch.
 # not sure why, they are created from a different context.
 # the only successful approach is to call cuda driver API in C.
+import contextlib
+import ctypes
 import dataclasses
 import gc
+import io
+import mmap
 import os
+import struct
+import uuid
 from contextlib import contextmanager
 from typing import Any, Callable, Optional, Union
 
 import torch
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
 
+def _copy_from_cuda_to_bytes(src_ptr: int, size_in_bytes: int) -> bytes:
+    dest_ptr = ctypes.create_string_buffer(size_in_bytes)
+    libcudart.cudaMemcpy(dest_ptr, src_ptr, size_in_bytes)
+    return bytes(dest_ptr)
+
+
+def _copy_from_bytes_to_cuda(dest_ptr: int, data: bytes) -> None:
+    # pass the size explicitly so no extra NUL terminator is appended
+    src_ptr = ctypes.create_string_buffer(data, len(data))
+    libcudart.cudaMemcpy(dest_ptr, src_ptr, len(data))
+
+
+def _write_bytes(data: bytes, binary_file: io.BufferedWriter) -> None:
+    # Pack the length as a 4-byte unsigned integer (little-endian)
+    data_len = len(data)
+    header = struct.pack("<I", data_len)
+    binary_file.write(header)
+    if data_len > 0:
+        binary_file.write(data)
+
+
+def _read_bytes(mmap_obj: mmap.mmap) -> bytes:
+    header = mmap_obj.read(4)
+    if not header:
+        raise ValueError("Missing header read")
+
+    if len(header) != 4:
+        raise ValueError("Incomplete header read")
+
+    data_len = struct.unpack("<I", header)[0]
+    if data_len == 0:
+        return b""
+
+    data = mmap_obj.read(data_len)
+
+    if len(data) != data_len:
+        raise ValueError("Incomplete data read")
+
+    return data
+
+
 def find_loaded_library(lib_name) -> Optional[str]:
     """
     According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
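For reference, the on-disk layout these helpers produce is a flat sequence of length-prefixed records: a 4-byte little-endian length header followed by the payload, with an empty payload encoded as a zero header and no body. A minimal standalone round-trip of that format (using a temporary file; no CUDA involved):

import mmap
import os
import struct
import tempfile

# Records mirror _write_bytes: 4-byte little-endian length, then payload.
# An empty record (e.g. a non-offloaded allocation) is just a zero header.
records = [b"weights-chunk", b"", b"another-chunk"]

with tempfile.NamedTemporaryFile(delete=False) as f:
    path = f.name
    for payload in records:
        f.write(struct.pack("<I", len(payload)))
        if payload:
            f.write(payload)

# Read the records back positionally, as wake_up does via mmap.
with open(path, "rb") as bin_file, \
        mmap.mmap(bin_file.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
    for expected in records:
        (data_len,) = struct.unpack("<I", mmap_obj.read(4))
        assert mmap_obj.read(data_len) == expected

os.remove(path)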
@@ -166,6 +214,8 @@ def __init__(self):
         self.python_malloc_callback = self._python_malloc_callback
         self.python_free_callback = self._python_free_callback
 
+        self.cache_filepath = ""
+
     def _python_malloc_callback(self, allocation_handle: HandleType) -> None:
         """
         Internal method to store the allocation data
@@ -197,7 +247,27 @@ def _python_free_callback(self, ptr: int) -> HandleType:
         )
         return data.handle
 
-    def sleep(self, offload_tags: Optional[Union[tuple[str, ...], str]] = None) -> None:
+    def _delete_cache_file(self):
+        """
+        Remove the sleep cache file if it exists.
+        """
+        if self.cache_filepath != "":
+            filepath = self.cache_filepath
+            self.cache_filepath = ""
+            try:
+                with contextlib.suppress(FileNotFoundError):
+                    os.remove(filepath)
+                    logger.info("cache file %s deleted", filepath)
+            except Exception as e:
+                logger.warning(
+                    "failed to delete sleep cache file %s.", filepath, exc_info=e
+                )
+
+    def sleep(
+        self,
+        level: Optional[int] = 1,
+        offload_tags: Optional[Union[tuple[str, ...], str]] = None,
+    ) -> None:
         """
         Put the allocator in sleep mode.
         All data in the memory allocation with the specified tag will be

@@ -218,6 +288,31 @@ def sleep(self, offload_tags: Optional[Union[tuple[str, ...], str]] = None) -> None:
         total_bytes = 0
         backup_bytes = 0
 
+        # remove the previous cache file if it exists
+        self._delete_cache_file()
+
+        # level 3: write weights to a file on disk
+        if level == 3:
+            unique_id = uuid.uuid4().hex
+            self.cache_filepath = os.path.join(
+                envs.VLLM_CACHE_ROOT, f"sleep_cache_{unique_id}.bin"
+            )
+            logger.info(
+                "sleep level %d writing to cache file %s", level, self.cache_filepath
+            )
+            with open(self.cache_filepath, "wb") as binary_file:
+                for ptr, data in self.pointer_to_data.items():
+                    handle = data.handle
+                    if data.tag in offload_tags:
+                        size_in_bytes = handle[1]
+                        data = _copy_from_cuda_to_bytes(ptr, size_in_bytes)
+                        _write_bytes(data, binary_file)
+                    else:
+                        _write_bytes(b"", binary_file)
+                    unmap_and_release(handle)
+            return
+
+        # handle other levels
         for ptr, data in self.pointer_to_data.items():
             handle = data.handle
             total_bytes += handle[1]
@@ -257,6 +352,25 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None:
         back to GPU memory. If None, all memory allocation will be loaded
         back to GPU memory.
         """
+        if self.cache_filepath != "":
+            logger.info("wake_up reading from cache file %s", self.cache_filepath)
+            with (
+                open(self.cache_filepath, "rb") as bin_file,
+                mmap.mmap(
+                    bin_file.fileno(), length=0, access=mmap.ACCESS_READ
+                ) as mmap_obj,
+            ):
+                for ptr, data in self.pointer_to_data.items():
+                    handle = data.handle
+                    create_and_map(handle)
+                    data = _read_bytes(mmap_obj)
+                    if len(data) > 0:
+                        _copy_from_bytes_to_cuda(ptr, data)
+
+            # remove the cache file
+            self._delete_cache_file()
+            return
+
         for ptr, data in self.pointer_to_data.items():
             if tags is None or data.tag in tags:
                 handle = data.handle
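Two details worth noting: the level 3 path writes one record per entry of pointer_to_data, emitting an empty record for allocations whose tag is not offloaded, and wake_up matches records back to pointers purely by position, so both sides rely on iterating the same dict in the same insertion order. At the call site, the worker drives the allocator roughly as follows — a minimal sketch adapted from the gpu_worker.py change below, assuming a CUDA-capable environment and omitting the surrounding bookkeeping:

import torch

from vllm.device_allocator.cumem import CuMemAllocator

allocator = CuMemAllocator.get_instance()

# Level 1 offloads "weights"-tagged allocations to CPU; level 3 spills
# them to a cache file on disk instead. Other tags are simply discarded.
level = 3
allocator.sleep(level, offload_tags=("weights",) if level in (1, 3) else tuple())

free_bytes_after_sleep, total = torch.cuda.mem_get_info()
print(f"GPU bytes still in use: {total - free_bytes_after_sleep}")

# Restores the mappings and, for level 3, streams weights back from disk.
allocator.wake_up()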

vllm/entrypoints/llm.py

Lines changed: 5 additions & 0 deletions
@@ -1487,6 +1487,11 @@ def sleep(self, level: int = 1):
                 sleep is good for sleeping and waking up the engine to run a
                 different model or update the model, where previous model
                 weights are not needed. It reduces CPU memory pressure.
+                Level 3 sleep offloads the model weights to disk and
+                discards the kv cache. The weights are not backed up in
+                CPU memory, and the content of the kv cache is forgotten.
+                Level 3 sleep keeps CPU memory usage minimal and loads
+                the weights efficiently from disk when woken up.
         """
         self.reset_prefix_cache()
         self.llm_engine.sleep(level=level)
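In user code this is the sequence exercised in test_cumem.py above. A condensed sketch, where the model name and prompt are placeholders and enable_sleep_mode follows the tests:

from vllm import LLM, SamplingParams

llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)  # placeholder model
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)

baseline = llm.generate(prompt, sampling_params)

# Level 3: weights go to a file under VLLM_CACHE_ROOT; the kv cache is dropped.
llm.sleep(level=3)

# ... GPU memory is now free for other work ...

llm.wake_up()  # reloads the weights from the cache file
output = llm.generate(prompt, sampling_params)
assert baseline[0].outputs[0].text == output[0].outputs[0].text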

vllm/v1/worker/gpu_worker.py

Lines changed: 3 additions & 1 deletion
@@ -120,7 +120,9 @@ def sleep(self, level: int = 1) -> None:
         }
 
         allocator = CuMemAllocator.get_instance()
-        allocator.sleep(offload_tags=("weights",) if level == 1 else tuple())
+        allocator.sleep(
+            level, offload_tags=("weights",) if level == 1 or level == 3 else tuple()
+        )
         free_bytes_after_sleep, total = torch.cuda.mem_get_info()
         freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
         used_bytes = total - free_bytes_after_sleep
