
Commit 3085478

lim4349, Cloud User, and gemini-code-assist[bot] authored
[Model] Add OpenCUA-7B support (#29068)
Signed-off-by: lim4349 <[email protected]>
Signed-off-by: Zero <[email protected]>
Co-authored-by: Cloud User <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 1073ba6 commit 3085478

4 files changed, +279 -0 lines changed


docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
 | `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
+| `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
 | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
 | `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
 | `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
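
Not part of the commit, but for context: the new table row above advertises `xlangai/OpenCUA-7B` as a text-plus-image model. A minimal offline-inference sketch using vLLM's standard multimodal `LLM.generate` API is given below; the prompt layout, image file name, and sampling settings are illustrative assumptions, and in practice the model's own chat template (loaded via `trust_remote_code`) governs the exact prompt format.

from vllm import LLM, SamplingParams
from PIL import Image

# Sketch only: the model ID comes from the table above; the prompt string and
# file name are assumptions, not taken from this commit.
llm = LLM(model="xlangai/OpenCUA-7B", trust_remote_code=True)

image = Image.open("screenshot.png")  # hypothetical local screenshot
prompt = "<|media_placeholder|>\nDescribe what is on the screen."

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)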

tests/models/registry.py

Lines changed: 3 additions & 0 deletions
@@ -725,6 +725,9 @@ def check_available_online(
     "NemotronH_Nano_VL_V2": _HfExamplesInfo(
         "nano_vl_dummy", is_available_online=False, trust_remote_code=True
     ),
+    "OpenCUAForConditionalGeneration": _HfExamplesInfo(
+        "xlangai/OpenCUA-7B", trust_remote_code=True
+    ),
     "Ovis": _HfExamplesInfo(
         "AIDC-AI/Ovis2-1B",
         trust_remote_code=True,
vllm/model_executor/models/opencua.py (new file)

Lines changed: 271 additions & 0 deletions

@@ -0,0 +1,271 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Adapted from Qwen2.5-VL implementation
# Copyright 2025 The vLLM team.
# Copyright 2025 XLANG Lab, The University of Hong Kong

"""Inference-only OpenCUA-7B model compatible with HuggingFace weights."""

from collections.abc import Mapping, Sequence
from typing import Any

import torch
import torch.nn as nn
from transformers import BatchFeature
from transformers.models.qwen2_vl import (
    Qwen2VLImageProcessor,
    Qwen2VLProcessor,
    Qwen2VLVideoProcessor,
)

from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
    MultiModalFieldConfig,
    MultiModalKwargs,
)
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
from vllm.multimodal.processing import (
    BaseMultiModalProcessor,
    PromptReplacement,
    PromptUpdate,
)
from vllm.transformers_utils.tokenizer import AnyTokenizer

from .qwen2_5_vl import (
    Qwen2_5_VisionTransformer as OpenCUAVisionTransformer,
)
from .qwen2_5_vl import (
    Qwen2_5_VLForConditionalGeneration,
)
from .qwen2_vl import (
    Qwen2VLDummyInputsBuilder,
    Qwen2VLMultiModalDataParser,
    Qwen2VLProcessingInfo,
    _create_qwen2vl_field_factory,
)
from .utils import (
    WeightsMapper,
    init_vllm_registered_model,
    maybe_prefix,
)


class OpenCUAProcessingInfo(Qwen2VLProcessingInfo):
    def get_hf_config(self):
        return self.ctx.get_hf_config()

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"image": None}

    def get_hf_processor(self, **kwargs: object):
        """Load OpenCUA processor."""
        tokenizer = self.get_tokenizer()
        vision_config = self.ctx.get_hf_image_processor_config()
        return OpenCUAProcessor(
            vision_config=vision_config,
            tokenizer=tokenizer,
            **kwargs,
        )


class OpenCUAProcessor(Qwen2VLProcessor):
    def check_argument_for_proper_class(self, attribute_name: str, arg: object) -> None:
        if attribute_name == "tokenizer":
            return
        return super().check_argument_for_proper_class(attribute_name, arg)

    def __init__(
        self,
        vision_config: dict,
        tokenizer: AnyTokenizer,
        **kwargs,
    ):
        image_processor = Qwen2VLImageProcessor(**vision_config)
        video_processor = Qwen2VLVideoProcessor(**vision_config)
        chat_template = kwargs.pop("chat_template", None)

        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            video_processor=video_processor,
            chat_template=chat_template,
            **kwargs,
        )

        self.image_token = "<|media_placeholder|>"

    def __call__(
        self,
        text=None,
        images=None,
        return_tensors=None,
        **kwargs,
    ):
        if text is not None:
            if not isinstance(text, list):
                text = [text]
            text_inputs = self.tokenizer(text, **kwargs)
        else:
            text_inputs = {}

        image_inputs = {}
        if images is not None:
            if not isinstance(images, list):
                images = [images]
            if len(images) > 0:
                image_inputs = self.image_processor(
                    images, return_tensors=return_tensors or "pt"
                )

        combined_inputs = {**text_inputs, **image_inputs}

        return BatchFeature(combined_inputs, tensor_type=return_tensors)


class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo]):
    def _get_data_parser(self) -> MultiModalDataParser:
        return Qwen2VLMultiModalDataParser(
            self.info.get_hf_config().vision_config.spatial_merge_size
        )

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return _create_qwen2vl_field_factory(
            self.info.get_hf_config().vision_config.spatial_merge_size
        )(hf_inputs)

    def _hf_processor_applies_updates(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ) -> bool:
"""vLLM이 prompt 업데이트를 처리하도록 False 반환."""
150+
        return False

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
        tokenizer = self.info.get_tokenizer()
        vocab = tokenizer.get_vocab()
        hf_config = self.info.get_hf_config()

        image_token_str = getattr(hf_processor, "image_token", "<|media_placeholder|>")
        image_token_id = vocab.get(
            image_token_str,
            getattr(hf_config, "media_placeholder_token_id", 151664),
        )

        merge_length = image_processor.merge_size**2

        def get_replacement_opencua(item_idx: int):
            out_item = out_mm_kwargs["image"][item_idx]
            grid_thw = out_item["image_grid_thw"].data
            assert isinstance(grid_thw, torch.Tensor)

            num_tokens = int(grid_thw.prod()) // merge_length
            return [image_token_id] * num_tokens

        return [
            PromptReplacement(
                modality="image",
                target=[image_token_id],
                replacement=get_replacement_opencua,
            )
        ]


class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder):
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

        image_token = "<|media_placeholder|>"

        return image_token * num_images


@MULTIMODAL_REGISTRY.register_processor(
    OpenCUAMultiModalProcessor,
    info=OpenCUAProcessingInfo,
    dummy_inputs=OpenCUADummyInputsBuilder,
)
class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
    merge_by_field_config = True
    multimodal_cpu_fields = {"image_grid_thw"}

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.language_model.": "language_model.model.",
            "model.visual.": "visual.",
            "vision_tower.": "visual.",
            "lm_head.": "language_model.lm_head.",
            "model.": "language_model.model.",
        }
    )

    supports_encoder_tp_data = True

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality.startswith("image"):
            return "<|media_placeholder|>"
        raise ValueError("Only image modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config

        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
        self.config = config
        self.vllm_config = vllm_config
        self.multimodal_config = multimodal_config
        self.quant_config = quant_config
        self.is_multimodal_pruning_enabled = (
            multimodal_config.is_multimodal_pruning_enabled()
        )

        if multimodal_config.get_limit_per_prompt("image"):
            attn_backend_override = (
                multimodal_config.mm_encoder_attn_backend
                if multimodal_config is not None
                else None
            )
            self.visual = OpenCUAVisionTransformer(
                vision_config=config.vision_config,
                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                quant_config=self.quant_config,
                prefix=maybe_prefix(prefix, "visual"),
                use_data_parallel=self.use_data_parallel,
                attn_backend_override=attn_backend_override,
            )
        else:
            self.visual = None

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Qwen2ForCausalLM"],
        )

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )
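
As a side note on the replacement callback above: each image contributes prod(image_grid_thw) // merge_size**2 copies of the placeholder token. A tiny standalone sketch of that arithmetic, using assumed example values rather than anything taken from this commit:

import torch

# Assumed example values; the real grid comes from the Qwen2-VL image processor.
grid_thw = torch.tensor([1, 34, 46])  # (temporal, height, width) patch grid
merge_size = 2                        # spatial_merge_size from vision_config

merge_length = merge_size ** 2
num_placeholder_tokens = int(grid_thw.prod()) // merge_length
print(num_placeholder_tokens)  # 1 * 34 * 46 // 4 = 391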

vllm/model_executor/models/registry.py

Lines changed: 4 additions & 0 deletions
@@ -289,6 +289,10 @@
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
+    "OpenCUAForConditionalGeneration": (
+        "opencua",
+        "OpenCUAForConditionalGeneration",
+    ),
     "InternS1ForConditionalGeneration": (
         "interns1",
         "InternS1ForConditionalGeneration",
