
Commit f4ec257

update model & fix CI.
1 parent 83317ba commit f4ec257


5 files changed, +26 −7 lines


docs/models.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -124,6 +124,12 @@
 * LLaDA (`LLaDA2MoeModelLM`)
     * [x] [mini-preview](https://huggingface.co/inclusionAI/LLaDA2.0-mini-preview/tree/d25d3b2ac0b966b64da11d6c791f8bf4bc31e90c)

+    Supported options (`--set OPTION VALUE`):
+    - `block_length`: default 32
+    - `steps`: default 32
+    - `minimal_topk`: default 1
+    - `threshold`: default 0.95
+
 * LlaMA-like (`LlamaForCausalLM`, `Llama4ForConditionalGeneration`):
     * [x] All LlaMA-1 models
     * [x] LlaMA-2: [Chat-7B](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), etc
```
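These options are passed at run time with the `--set OPTION VALUE` syntax shown in the doc, e.g. `--set block_length 32 --set threshold 0.95` (the values here just restate the defaults). How the four options interact during decoding is sketched after the models/bailing.cpp diff below.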

models/bailing.cpp

Lines changed: 3 additions & 4 deletions
```diff
@@ -1,5 +1,6 @@
 #include <algorithm>
 #include <numeric>
+#include <cstring>
 #include <functional>
 #include "deepseek.h"
 #include "qwen.h"
@@ -464,8 +465,6 @@ namespace chatllm::bailing::llada
         if (gen_max_tokens > 0)
             gen_max_tokens = n_past + (int)curr_input_ids.size() + gen_max_tokens;

-        bool first_call = true;
-
         if (performance)
             performance->Reset();

@@ -516,7 +515,7 @@ namespace chatllm::bailing::llada
             }
         }

-        std::memcpy(block_result.data(), curr_input_ids.data(), curr_input_ids.size() * sizeof(curr_input_ids[0]));
+        memcpy(block_result.data(), curr_input_ids.data(), curr_input_ids.size() * sizeof(curr_input_ids[0]));
         int next_pos_to_add = (int)(curr_input_ids.size());
         int block_prefilled_size = next_pos_to_add;
         curr_input_ids.clear();
@@ -525,7 +524,7 @@ namespace chatllm::bailing::llada
         {
             for (int step = 0; !completed && (step < steps); step++)
             {
-                // TODO: don't re-run "known" tokens again and again.
+                // Note: we have to run a whole block again and again.
                 std::vector<float> lm_logits;
                 generate_next_block(block_result.data(), block_length, gen_config, &lm_logits, nullptr);

```
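The comment rewrite in the last hunk (TODO → Note) records a real constraint of LLaDA-style block-diffusion decoding: the set of masked positions inside a block changes on every denoising step, so per-token results cannot be cached and reused the way an autoregressive decoder reuses its KV cache; the whole block must be re-scored each step. Below is a minimal, self-contained sketch of that loop shape, wired to the four options from docs/models.md. Everything named here (`MASK_ID`, `score_block`, `denoise_step`) is an illustrative stand-in, not the repo's actual API:

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Illustrative stand-in for the tokenizer's mask token id.
constexpr int MASK_ID = -1;

// Dummy stand-in for one forward pass over the WHOLE block: the real code
// runs the transformer over all block_length positions every step.
// Returns a (confidence, predicted token) pair per position.
static std::vector<std::pair<float, int>> score_block(const std::vector<int> &block)
{
    std::vector<std::pair<float, int>> out;
    for (int tok : block)
        out.push_back({tok == MASK_ID ? 0.5f : 1.0f, /*dummy prediction*/ 42});
    return out;
}

// One denoising step: commit every masked position whose confidence clears
// `threshold`, but always at least `minimal_topk` of them so the step is
// guaranteed to make progress. Returns true once the block is fully decoded.
static bool denoise_step(std::vector<int> &block, float threshold, int minimal_topk)
{
    auto scored = score_block(block); // the whole block is re-run, every step

    std::vector<size_t> masked;
    for (size_t i = 0; i < block.size(); i++)
        if (block[i] == MASK_ID)
            masked.push_back(i);

    // Most confident masked positions first.
    std::sort(masked.begin(), masked.end(),
              [&](size_t a, size_t b) { return scored[a].first > scored[b].first; });

    int committed = 0;
    for (size_t i : masked)
    {
        if (scored[i].first >= threshold || committed < minimal_topk)
        {
            block[i] = scored[i].second;
            committed++;
        }
    }
    return committed == (int)masked.size();
}

int main()
{
    // Defaults from docs/models.md (--set OPTION VALUE).
    const int   block_length = 32, steps = 32, minimal_topk = 1;
    const float threshold    = 0.95f;

    std::vector<int> block(block_length, MASK_ID);
    for (int step = 0; step < steps; step++)
        if (denoise_step(block, threshold, minimal_topk))
            break;
}
```

In the actual code, the `memcpy` above prefills `block_result` with the already-known tokens of the block, and `generate_next_block` plays the role of `score_block`.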
models/dots.cpp

Lines changed: 0 additions & 2 deletions
```diff
@@ -424,7 +424,6 @@ namespace chatllm::dots::ocr

         vision::image_arrange(scaled, w, patch_size, image.data, vision::PatchesFormat::PatchesLeftRightDown_MergeN_ChannelsRGB_PixelsLeftRightDown);

-        const int merge_length = vis_config->spatial_merge_size * vis_config->spatial_merge_size;
         image.emb_vec_number = image.grid_width * image.grid_height;

         const int id_start = tok->get_image_total_emb_vectors() - image.emb_vec_number + tok->vocab_size;
@@ -556,7 +555,6 @@ namespace chatllm::dots::ocr

     void ConditionalGeneration::set_additional_args(const std::map<std::string, std::string> &args)
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
     }

     int64_t ConditionalGeneration::get_param_num(bool effective_only) const
```

scripts/models.json

Lines changed: 16 additions & 0 deletions
```diff
@@ -3450,5 +3450,21 @@
         }
       }
     }
+  },
+  "llada2.0": {
+    "brief": "LLaDA2.0-mini-preview is a diffusion language model featuring a 16BA1B Mixture-of-Experts (MoE) architecture.",
+    "default": "mini-preview",
+    "license": "Apache License 2.0",
+    "variants": {
+      "mini-preview": {
+        "default": "q8",
+        "quantized": {
+          "q8": {
+            "size": 17277819200,
+            "url": "chatllm_quantized_bailing/llada2.0-mini-preview.bin"
+          }
+        }
+      }
+    }
   }
 }
```
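For scale, `"size": 17277819200` is the size of the q8 artifact in bytes (about 17.3 GB, consistent with an 8-bit quantization of a ~16B-parameter MoE). With this entry, the download tooling should be able to resolve the model by the id `llada2.0`, defaulting to the `mini-preview` variant and its `q8` quantization; the exact fetch command lives in the scripts, which this commit does not touch.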

src/models.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -1474,7 +1474,7 @@ namespace chatllm
     {
         const int qlen = ggml::get_dim(hidden_states, 1);
         const int batch = ggml::get_dim(hidden_states, 2);
-        CHATLLM_CHECK(qlen >= last_n);
+        const int last_n = qlen >= this->last_n ? this->last_n : qlen;
         order = nullptr;

         hidden_states = ggml::view_3d(ctx, hidden_states, model->hidden_size, last_n, batch,
```
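This one-line change relaxes a hard assertion into a clamp: when fewer than `last_n` positions are available (`qlen < this->last_n`), the code now introduces a local `last_n` equal to `qlen` (shadowing the member) so the `ggml::view_3d` below simply takes every position that exists instead of aborting. Presumably this is the "fix CI" half of the commit message, hit when a short sequence reached this path.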
