Commit cd557ce

Merge branch 'main' into fix-gpt-oss-tool-parser-v2
2 parents: b792948 + 5a48025

7 files changed: +38 −30 lines


tests/engine/test_arg_utils.py

Lines changed: 1 addition & 1 deletion
@@ -279,7 +279,7 @@ def test_prefix_cache_default():
     args = parser.parse_args([])

     engine_args = EngineArgs.from_cli_args(args=args)
-    assert not engine_args.enable_prefix_caching, "prefix caching defaults to off."
+    assert engine_args.enable_prefix_caching, "prefix caching should default to on."

     # with flag to turn it on.
     args = parser.parse_args(["--enable-prefix-caching"])
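For context, a minimal sketch of the behavior this test now pins down. The parser setup and import paths below are assumptions based on vLLM's usual test helpers (FlexibleArgumentParser populated via EngineArgs.add_cli_args), and the negative flag form follows vLLM's boolean-flag convention; neither is shown in this diff:

    from vllm.engine.arg_utils import EngineArgs
    from vllm.utils import FlexibleArgumentParser  # assumed location

    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # No flags: prefix caching is now enabled by default.
    engine_args = EngineArgs.from_cli_args(parser.parse_args([]))
    assert engine_args.enable_prefix_caching

    # Explicit opt-out via the flag's negative form still works.
    engine_args = EngineArgs.from_cli_args(
        parser.parse_args(["--no-enable-prefix-caching"])
    )
    assert not engine_args.enable_prefix_caching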

tests/v1/core/test_scheduler.py

Lines changed: 7 additions & 12 deletions
@@ -76,11 +76,11 @@ def test_get_num_unfinished_requests():
 @pytest.mark.parametrize(
     "enable_prefix_caching, prompt_logprobs",
     [
-        (None, None),
+        (False, None),
         (True, 5),
     ],
 )
-def test_schedule(enable_prefix_caching: bool | None, prompt_logprobs: int | None):
+def test_schedule(enable_prefix_caching: bool, prompt_logprobs: int | None):
     """Test scheduling.
     Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
     """
@@ -582,12 +582,12 @@ def test_check_stop_min_tokens():
 @pytest.mark.parametrize(
     "enable_prefix_caching, prompt_logprobs",
     [
-        (None, None),
+        (False, None),
         (True, 5),
     ],
 )
 def test_schedule_concurrent_batches(
-    enable_prefix_caching: bool | None, prompt_logprobs: int | None
+    enable_prefix_caching: bool, prompt_logprobs: int | None
 ):
     scheduler = create_scheduler(
         max_num_batched_tokens=1024,
@@ -1425,7 +1425,7 @@ def create_scheduler_with_priority(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: bool | None = None,
+    enable_prefix_caching: bool = False,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
     use_kv_connector: bool = False,
@@ -1444,7 +1444,7 @@ def create_scheduler_with_priority(
         max_num_batch_tokens: max num tokens to batch
         enable_prefix_caching: optionally force APC config
                                (True/False) or use default
-                               (None)
+                               (False)

     Returns:
         {class}`Scheduler` instance with priority scheduling
@@ -1467,17 +1467,12 @@ def create_scheduler_with_priority(
         seed=42,
     )
     # Cache config, optionally force APC
-    kwargs_cache = (
-        {}
-        if enable_prefix_caching is None
-        else {"enable_prefix_caching": enable_prefix_caching}
-    )
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
         swap_space=0,
         cache_dtype="auto",
-        **kwargs_cache,
+        enable_prefix_caching=enable_prefix_caching,
     )
     kv_transfer_config = (
         KVTransferConfig(
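The deleted kwargs_cache indirection existed only because None meant "defer to CacheConfig's default"; now that the helper owns a concrete bool default, the value can be forwarded unconditionally. A generic before/after sketch of the pattern (illustrative values, not the real helper):

    # Before: tri-state plumbing — only pass the kwarg when the caller chose.
    kwargs_cache = (
        {} if enable_prefix_caching is None
        else {"enable_prefix_caching": enable_prefix_caching}
    )
    cache_config = CacheConfig(block_size=16, **kwargs_cache)

    # After: the helper default (False) is always a real value, so pass it directly.
    cache_config = CacheConfig(block_size=16, enable_prefix_caching=enable_prefix_caching)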

tests/v1/core/utils.py

Lines changed: 3 additions & 8 deletions
@@ -42,7 +42,7 @@ def create_scheduler(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: bool | None = None,
+    enable_prefix_caching: bool = False,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
     use_kv_connector: None | bool | MockKVConfig = None,
@@ -63,7 +63,7 @@ def create_scheduler(
         max_num_batch_tokens: max num tokens to batch
         enable_prefix_caching: optionally force APC config
                                (True/False) or use default
-                               (None)
+                               (False)

     Returns:
         {class}`Scheduler` instance
@@ -87,17 +87,12 @@ def create_scheduler(
         skip_tokenizer_init=skip_tokenizer_init,
     )
     # Cache config, optionally force APC
-    kwargs_cache = (
-        {}
-        if enable_prefix_caching is None
-        else {"enable_prefix_caching": enable_prefix_caching}
-    )
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
         swap_space=0,
         cache_dtype="auto",
-        **kwargs_cache,
+        enable_prefix_caching=enable_prefix_caching,
     )
     kv_transfer_config = None
     if isinstance(use_kv_connector, MockKVConfig):

tests/v1/kv_offload/test_cpu_offloading.py

Lines changed: 5 additions & 1 deletion
@@ -12,10 +12,14 @@
 from vllm import LLM, SamplingParams, TokensPrompt
 from vllm.config import KVEventsConfig, KVTransferConfig
 from vllm.distributed.kv_events import BlockStored, KVEventBatch
+from vllm.platforms import current_platform
 from vllm.utils.system_utils import set_env_var

 CPU_BLOCK_SIZES = [48]
-ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER"]
+ATTN_BACKENDS = ["FLASH_ATTN"]
+
+if current_platform.is_cuda():
+    ATTN_BACKENDS.append("FLASHINFER")


 class MockSubscriber:
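Gating FLASHINFER on current_platform.is_cuda() means the FLASHINFER cases simply never materialize on non-CUDA platforms, instead of failing at import or collection time. A hedged sketch of how such a platform-gated list typically feeds pytest parametrization (the test name and signature here are assumptions, not from this diff):

    import pytest

    from vllm.platforms import current_platform

    ATTN_BACKENDS = ["FLASH_ATTN"]
    if current_platform.is_cuda():
        # FLASHINFER is CUDA-only, so add it conditionally.
        ATTN_BACKENDS.append("FLASHINFER")

    @pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
    def test_cpu_offloading(attn_backend: str) -> None:
        ...  # exercise offloading with the selected backend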

vllm/config/cache.py

Lines changed: 2 additions & 2 deletions
@@ -73,8 +73,8 @@ class CacheConfig:
     sliding_window: int | None = None
     """Sliding window size for the KV cache. This is primarily set in
     `ModelConfig` and that value should be manually duplicated here."""
-    enable_prefix_caching: bool | None = None
-    """Whether to enable prefix caching. Enabled by default for V1."""
+    enable_prefix_caching: bool = True
+    """Whether to enable prefix caching."""
     prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
     """Set the hash algorithm for prefix caching:\n
     - "sha256" uses Pickle for object serialization before hashing.\n

vllm/engine/arg_utils.py

Lines changed: 19 additions & 5 deletions
@@ -425,7 +425,7 @@ class EngineArgs:
         ParallelConfig.max_parallel_loading_workers
     )
     block_size: BlockSize | None = CacheConfig.block_size
-    enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching
+    enable_prefix_caching: bool | None = None
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
     )
@@ -1975,10 +1975,11 @@ def _set_default_args(
         if self.prefill_context_parallel_size > 1:
             default_chunked_prefill = False
             default_prefix_caching = False
-            logger.warning(
+            logger.warning_once(
                 "--prefill-context-parallel-size > 1 is not compatible with "
                 "chunked prefill and prefix caching now. Chunked prefill "
-                "and prefix caching have been disabled by default."
+                "and prefix caching have been disabled by default.",
+                scope="local",
             )

         if self.enable_chunked_prefill is None:
@@ -1988,15 +1989,27 @@ def _set_default_args(
                 "%s chunked prefill by default",
                 "Enabling" if default_chunked_prefill else "Disabling",
             )
+        elif (
+            model_config.runner_type == "generate"
+            and not self.enable_chunked_prefill
+            and default_chunked_prefill
+        ):
+            logger.warning_once(
+                "This model does not officially support disabling chunked prefill. "
+                "Disabling this manually may cause the engine to crash "
+                "or produce incorrect outputs.",
+                scope="local",
+            )
         elif (
             model_config.runner_type == "pooling"
             and self.enable_chunked_prefill
             and not default_chunked_prefill
         ):
-            logger.warning(
+            logger.warning_once(
                 "This model does not officially support chunked prefill. "
                 "Enabling this manually may cause the engine to crash "
                 "or produce incorrect outputs.",
+                scope="local",
             )

         if self.enable_prefix_caching is None:
@@ -2011,10 +2024,11 @@ def _set_default_args(
             and self.enable_prefix_caching
             and not default_prefix_caching
         ):
-            logger.warning(
+            logger.warning_once(
                 "This model does not officially support prefix caching. "
                 "Enabling this manually may cause the engine to crash "
                 "or produce incorrect outputs.",
+                scope="local",
             )

         world_size = self.pipeline_parallel_size * self.tensor_parallel_size
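The tri-state now lives only at the CLI layer: EngineArgs.enable_prefix_caching stays bool | None, where None means "no explicit user choice", and _set_default_args resolves it to a concrete bool before CacheConfig is built. Condensed from the logic visible around this hunk (a simplification, not the verbatim implementation):

    if self.enable_prefix_caching is None:
        # No explicit choice: adopt the model/runner-dependent default.
        self.enable_prefix_caching = default_prefix_caching
    elif self.enable_prefix_caching and not default_prefix_caching:
        # Explicit opt-in on a model that defaults it off: warn, but respect it.
        logger.warning_once(
            "This model does not officially support prefix caching. ...",
            scope="local",
        )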

vllm/v1/core/sched/scheduler.py

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ def __init__(
         self.kv_cache_manager = KVCacheManager(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
-            enable_caching=bool(self.cache_config.enable_prefix_caching),
+            enable_caching=self.cache_config.enable_prefix_caching,
             use_eagle=self.use_eagle,
             log_stats=self.log_stats,
             enable_kv_cache_events=self.enable_kv_cache_events,
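Since CacheConfig.enable_prefix_caching can no longer be None, the defensive bool() cast is redundant and KVCacheManager now receives the configured value unchanged.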
