
Commit a259d62

mm: add qwen vl2.5 model support. (#86)
- Add Qwen VL 2.5 model support.
- Qwen VL 2.5 only supports 'transformers' as the ViT engine (TRT is not supported yet).
- Upgrade package versions to make sure the VL 2.5 code is included.

Test commands:

Server:

```
dashinfer_vlm_serve --model qwen/Qwen2.5-VL-3B-Instruct --vision_engine transformers --port 8000 --host=127.0.0.1
```

Client:

```
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d \
  '{"model": "qwen/Qwen2.5-VL-3B-Instruct", "messages": [{"role": "user", "content": [{ "type": "text", "text": "Describe the image." }, {"type": "image_url", "image_url": {"url": "https://farm4.staticflickr.com/3075/3168662394_7d7103de7d_z_d.jpg"}}]}], "max_completion_tokens": 1024, "top_p": 0.5, "temperature": 0.1, "frequency_penalty": 1.05 }'
```

Result:

```
{"id":"chatcmpl-rxqDiCQEJweEeeB7FADiER","object":"chat.completion", "created":1747992522,"model":"model","choices":[{"index":0,"message":{"role":"assistant","content":"The image features a small hummingbird perched on a branch. The bird is positioned in the center of the scene, with its vibrant colors and delicate features clearly visible. The hummingbird appears to be enjoying its time in nature, possibly searching for food or simply resting on the branch. \n\nThere are no other birds or animals present in the image, making it a solitary moment captured in this natural setting."},"finish_reason":"stop"}],"usage":{"prompt_tokens":382,"total_tokens":95,"completion_tokens":81}}
```
1 parent 4b57232 commit a259d62
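
For reference, the same request can also be sent from Python through the OpenAI-compatible endpoint. This is a minimal sketch, not part of the commit: it assumes a recent `openai` client package, the server started with the command above on `localhost:8000`, and a placeholder API key (the local server does not validate it).

```python
# Minimal sketch: send the same multimodal chat request via the OpenAI-compatible API.
# Assumes dashinfer_vlm_serve is running on localhost:8000 (see the server command above);
# the api_key value is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="qwen/Qwen2.5-VL-3B-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the image."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://farm4.staticflickr.com/3075/3168662394_7d7103de7d_z_d.jpg"
                    },
                },
            ],
        }
    ],
    max_completion_tokens=1024,
    top_p=0.5,
    temperature=0.1,
    frequency_penalty=1.05,
)
print(response.choices[0].message.content)
```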

File tree

2 files changed: +79 -33 lines changed

multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py

Lines changed: 74 additions & 28 deletions
```diff
@@ -6,10 +6,14 @@
 import torch
 import glob
 import warnings
-from modelscope import snapshot_download
-from transformers import Qwen2VLForConditionalGeneration, AutoConfig, AutoTokenizer
-from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
 from tqdm import tqdm
+
+from transformers import AutoConfig, AutoTokenizer, AutoProcessor
+
+from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+
 from safetensors.torch import safe_open
 from dashinfer import allspark
 from dashinfer.allspark.model_loader import HuggingFaceModel, ModelSerializerException
@@ -59,25 +63,58 @@ def load_model(
         # the open-source model can be loaded by huggingface
         try:
             if not os.path.isdir(self.hf_model_path):
+                from modelscope import snapshot_download
                 self.hf_model_path = snapshot_download(self.hf_model_path)
-            self.torch_model = Qwen2VLForConditionalGeneration.from_pretrained(
-                self.hf_model_path,
-                trust_remote_code=self.trust_remote_code,
-                torch_dtype=dtype_to_torch_dtype(self.data_type),
-                device_map="cpu",
-                **kwargs,
-            ).eval()
-            self.vit_config = Qwen2VLVisionConfig.from_pretrained(
-                self.hf_model_path,
-                trust_remote_code=True,
-                revision=None,
-                code_revision=None,
-            )
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.hf_model_path,
-                trust_remote_code=self.trust_remote_code,
-                **kwargs,
+
+            # Read config to determine model architecture
+            self.hf_model_config = AutoConfig.from_pretrained(
+                self.hf_model_path, trust_remote_code=self.trust_remote_code
             )
+
+            if hasattr(self.hf_model_config, "architectures") and "Qwen2_5_VLForConditionalGeneration" in self.hf_model_config.architectures:
+                self.torch_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=self.trust_remote_code,
+                    torch_dtype=dtype_to_torch_dtype(self.data_type),
+                    device_map="cpu",
+                    **kwargs,
+                ).eval()
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=self.trust_remote_code,
+                    **kwargs,
+                )
+                self.processor = AutoProcessor.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=self.trust_remote_code,
+                    **kwargs,
+                )
+                self.vit_config = Qwen2_5_VLVisionConfig.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=True,
+                    revision=None,
+                    code_revision=None,
+                )
+            else:
+                self.torch_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=self.trust_remote_code,
+                    torch_dtype=dtype_to_torch_dtype(self.data_type),
+                    device_map="cpu",
+                    **kwargs,
+                ).eval()
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=self.trust_remote_code,
+                    **kwargs,
+                )
+                self.vit_config = Qwen2VLVisionConfig.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=True,
+                    revision=None,
+                    code_revision=None,
+                )
+            pass
         except Exception as e:
             print(
                 f"exception when load model: {self.hf_model_path} , exception: {e}"
@@ -102,10 +139,10 @@ def read_model_config(self):
         self.hf_model_config = AutoConfig.from_pretrained(
             self.hf_model_path, trust_remote_code=self.trust_remote_code
         )
-        self.adapter = QWen2ConfigAdapter(self.hf_model_config)
-        self.as_model_config = self.adapter.model_config
-        if self.user_set_data_type is None:
-            self.data_type = self.adapter.get_model_data_type()
+        self.adapter = QWen2ConfigAdapter(self.hf_model_config)
+        self.as_model_config = self.adapter.model_config
+        if self.user_set_data_type is None:
+            self.data_type = self.adapter.get_model_data_type()
         return self

     def serialize(
@@ -127,17 +164,26 @@ def serialize(
             onnx_trt_obj.export_onnx(onnxFile)
             onnx_trt_obj.generate_trt_engine(onnxFile, self.vision_model_path)
         elif self.vision_engine == "transformers":
-            visual_model = Qwen2VLForConditionalGeneration.from_pretrained(
+            if hasattr(self.hf_model_config, "architectures") and "Qwen2_5_VLForConditionalGeneration" in self.hf_model_config.architectures:
+                visual_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                 self.hf_model_path,
                 trust_remote_code=self.trust_remote_code,
                 torch_dtype=dtype_to_torch_dtype(self.data_type),
-                device_map="cpu",
-                attn_implementation="flash_attention_2",
+                device_map="auto",
+                attn_implementation="sdpa",
+                ).visual.eval()
+            else:
+                visual_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                    self.hf_model_path,
+                    trust_remote_code=self.trust_remote_code,
+                    torch_dtype=dtype_to_torch_dtype(self.data_type),
+                    device_map="auto",
+                    attn_implementation="sdpa",
             ).visual.eval()
             self.vision_model_path = visual_model
         else:
             raise ValueError(f"unsupported engine {self.vision_engine}")
-
+
         # Convert Allspark LLM
         enable_quant = False
         weight_only_quant=False
```
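
The core of the model_loader.py change is dispatching on the `architectures` field of the checkpoint's config: if it contains `Qwen2_5_VLForConditionalGeneration`, the Qwen 2.5 VL classes (plus an `AutoProcessor` and `Qwen2_5_VLVisionConfig`) are used, otherwise the existing Qwen 2 VL path is kept. A standalone sketch of that check follows; the checkpoint id is illustrative, and only the class names are taken from the diff above.

```python
# Standalone sketch of the architecture dispatch used in load_model():
# read config.json and pick the Qwen2-VL or Qwen2.5-VL class accordingly.
from transformers import (
    AutoConfig,
    Qwen2VLForConditionalGeneration,
    Qwen2_5_VLForConditionalGeneration,
)

model_path = "Qwen/Qwen2.5-VL-3B-Instruct"  # illustrative checkpoint id
config = AutoConfig.from_pretrained(model_path)

architectures = getattr(config, "architectures", None) or []
if "Qwen2_5_VLForConditionalGeneration" in architectures:
    model_cls = Qwen2_5_VLForConditionalGeneration  # Qwen 2.5 VL checkpoints
else:
    model_cls = Qwen2VLForConditionalGeneration  # Qwen 2 VL checkpoints

model = model_cls.from_pretrained(model_path, device_map="cpu").eval()
print(type(model).__name__)
```

Note that only the 'transformers' vision engine gains the Qwen 2.5 VL branch; the TRT export path is unchanged, which matches the "TRT not supported yet" note in the commit message.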

multimodal/requirements.txt

Lines changed: 5 additions & 5 deletions
```diff
@@ -1,9 +1,9 @@
 dashinfer@https://github.com/modelscope/dash-infer/releases/download/v2.0.0-rc3/dashinfer-2.0.0rc3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
 av
-numpy==1.24.3
-requests==2.32.3
-nvtx==0.2.10
-transformers>=4.45.0
+numpy>=1.24.3
+requests>=2.32.3
+nvtx>=0.2.10
+transformers>=4.48.9
 cachetools>=5.4.0
 six
 tiktoken
@@ -12,7 +12,7 @@ shortuuid
 fastapi
 pydantic_settings
 uvicorn
-cmake==3.22.6
+cmake>=3.22.6
 modelscope
 aiohttp
 onnx
```
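
The transformers floor moves from 4.45.0 to 4.48.9 so that the Qwen 2.5 VL classes are importable. A quick sanity check of an installed environment is sketched below; the exact minimum version that ships `Qwen2_5_VLForConditionalGeneration` is not pinned here, so the import test is the authoritative check.

```python
# Sanity check: confirm the installed transformers build exposes the Qwen2.5-VL class.
import transformers

print("transformers", transformers.__version__)
try:
    from transformers import Qwen2_5_VLForConditionalGeneration  # noqa: F401
except ImportError:
    raise SystemExit("transformers is too old for Qwen2.5-VL; reinstall from requirements.txt")
print("Qwen2.5-VL support available")
```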
