@@ -73,7 +73,7 @@ def test_without_spec_decoding(
     run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)


-def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+def test_with_eagle3_spec_decoding(monkeypatch: pytest.MonkeyPatch):
     """Test consistency and acceptance rates with some different combos of
     preemption, executor, async scheduling, prefill chunking,
     spec decoding model length.
@@ -106,6 +106,42 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
     run_tests(monkeypatch, MTP_MODEL, test_configs, [{}])


+def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+    """Test ngram_gpu speculative decoding with different configurations.
+
+    This test specifically validates ngram_gpu behavior with various:
+    - Number of speculative tokens (2-6)
+    - Prompt lookup window sizes (min/max)
+    - Async scheduling enabled (as in production)
+    - Different executors and chunking settings
+    """
+
+    # Variant with larger speculation window
+    ngram_gpu_config = {
+        "method": "ngram_gpu",
+        "num_speculative_tokens": 3,
+        "prompt_lookup_max": 3,
+        "prompt_lookup_min": 2,
+    }
+
+    # Test configurations covering various scenarios:
+    # (test_preemption, executor, async_scheduling,
+    #  spec_config, test_prefill_chunking)
+    test_configs = [
+        (False, "mp", False, None, False),
+        (False, "mp", False, ngram_gpu_config, False),
+        (True, "mp", False, ngram_gpu_config, True),
+        (False, "mp", True, ngram_gpu_config, False),
+        (True, "mp", True, ngram_gpu_config, False),
+        (True, "uni", True, ngram_gpu_config, False),
+        (True, "mp", True, ngram_gpu_config, True),
+    ]
+
+    # Use MODEL (Qwen) for ngram_gpu tests as it's lighter weight
+    # and ngram_gpu doesn't require a specific draft model
+    run_tests(monkeypatch, MODEL, test_configs, [{}])
+
+
 @dynamo_config.patch(cache_size_limit=16)
 def run_tests(
     monkeypatch: pytest.MonkeyPatch,
@@ -217,18 +253,19 @@ def run_test(
         else dict(gpu_memory_utilization=0.9)
     )
     spec_mml = (spec_config or {}).get("max_model_len")
+    spec_method = (spec_config or {}).get("method", "none")
     test_config = (
         f"executor={executor}, preemption={test_preemption}, "
         f"async_sched={async_scheduling}, "
         f"chunk_prefill={test_prefill_chunking}, "
-        f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
+        f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}"
     )
     print("-" * 80)
     print(f"---- TESTING {test_str}: {test_config}")
     print("-" * 80)
     with VllmRunner(
         model,
-        max_model_len=512,
+        max_model_len=4096,
         enable_chunked_prefill=test_prefill_chunking,
         # Force prefill chunking
         max_num_batched_tokens=48 if test_prefill_chunking else None,
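For context on the config shape exercised above, here is a minimal standalone sketch of passing a spec config like ngram_gpu_config to vLLM. It assumes a recent vLLM where LLM() accepts a speculative_config dict; the model name is illustrative (not the test's MODEL constant), and "ngram_gpu" is the method this PR adds, with the same fields as the existing "ngram" method.

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # illustrative model choice
    max_model_len=4096,
    speculative_config={
        "method": "ngram_gpu",          # method added by this PR ("ngram" on older builds)
        "num_speculative_tokens": 3,    # draft tokens proposed per step
        "prompt_lookup_max": 3,         # longest prompt n-gram used for matching
        "prompt_lookup_min": 2,         # shortest prompt n-gram used for matching
    },
)

outputs = llm.generate(["The capital of France is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)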