Issue/74: Adapt the Llama model to InfiniCore::nn::module #75
Open: Ceng23333 wants to merge 2 commits into main from issue/74.
.gitmodules (new file, registering spdlog as a submodule):

@@ -0,0 +1,3 @@
[submodule "third_party/spdlog"]
    path = third_party/spdlog
    url = https://github.com/gabime/spdlog.git
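After this change, a fresh checkout typically needs `git submodule update --init --recursive` (or the project's usual setup step) to fetch spdlog into third_party/spdlog.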
New header (KV cache, namespace infinilm::cache):

@@ -0,0 +1,116 @@
#pragma once

#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include <algorithm>
#include <utility>
#include <memory>

namespace infinilm::cache {

/**
 * @brief Simple KV cache structure for incremental decoding
 *
 * Stores key and value caches with shape [n_kv_head, capacity, head_dim].
 * Similar to DynamicLayer in the Python cache_utils.py.
 *
 * This is a common component that can be used by any model architecture
 * that needs KV caching for attention mechanisms.
 */
struct KVCache {
    infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim]
    infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim]
    size_t cache_position;      // Current position in the cache
    size_t max_capacity;        // Maximum capacity of the cache
    bool initialized;           // Whether the cache has been initialized

    KVCache()
        // Create empty placeholder tensors (replaced on first use)
        : k_cache(infinicore::Tensor::empty({1, 1, 1}, infinicore::DataType::F32,
                                            infinicore::Device(infinicore::Device::Type::CPU, 0))),
          v_cache(infinicore::Tensor::empty({1, 1, 1}, infinicore::DataType::F32,
                                            infinicore::Device(infinicore::Device::Type::CPU, 0))),
          cache_position(0), max_capacity(0), initialized(false) {}

    /**
     * @brief Initialize or grow the cache capacity
     * @param num_kv_heads Number of key-value heads
     * @param head_dim Head dimension
     * @param seq_len Sequence length of the new tokens
     * @param dtype Data type
     * @param device Device
     */
    void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len,
                         infinicore::DataType dtype, const infinicore::Device &device) {
        size_t required_capacity = cache_position + seq_len;

        // Lazy initialization
        if (!initialized) {
            max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
            k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
                                                dtype, device);
            v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
                                                dtype, device);
            cache_position = 0;
            initialized = true;
        }
        // Grow the cache if needed (similar to DynamicLayer in Python)
        else if (required_capacity > max_capacity) {
            size_t new_capacity = std::max(max_capacity * 2, required_capacity);
            auto k_new = infinicore::Tensor::empty({num_kv_heads, new_capacity, head_dim},
                                                   dtype, device);
            auto v_new = infinicore::Tensor::empty({num_kv_heads, new_capacity, head_dim},
                                                   dtype, device);

            // Copy existing cache data
            if (cache_position > 0) {
                auto k_slice = k_cache->narrow({{1, 0, cache_position}});
                auto v_slice = v_cache->narrow({{1, 0, cache_position}});
                k_new->narrow({{1, 0, cache_position}})->copy_from(k_slice);
                v_new->narrow({{1, 0, cache_position}})->copy_from(v_slice);
            }

            k_cache = k_new;
            v_cache = v_new;
            max_capacity = new_capacity;
        }
    }

    /**
     * @brief Update the cache with new key and value states
     * @param k_new New key states [n_kv_head, seq_len, head_dim]
     * @param v_new New value states [n_kv_head, seq_len, head_dim]
     * @return Pair of (k_total, v_total) with shape [n_kv_head, total_seq_len, head_dim]
     *
     * Note: This method writes to the cache. If used with an attention op that also
     * writes to the cache, call this AFTER attention, not before.
     */
    std::pair<infinicore::Tensor, infinicore::Tensor> update(
        const infinicore::Tensor &k_new,
        const infinicore::Tensor &v_new) {
        size_t seq_len = k_new->shape()[1];
        size_t num_kv_heads = k_new->shape()[0];
        size_t head_dim = k_new->shape()[2];

        // Ensure capacity
        ensure_capacity(num_kv_heads, head_dim, seq_len,
                        k_new->dtype(), k_new->device());

        // Copy new K/V into the cache at the current position
        auto k_dst = k_cache->narrow({{1, cache_position, seq_len}});
        auto v_dst = v_cache->narrow({{1, cache_position, seq_len}});
        k_dst->copy_from(k_new);
        v_dst->copy_from(v_new);

        // Advance the position
        cache_position += seq_len;

        // Return the total cache up to the current position
        auto k_total = k_cache->narrow({{1, 0, cache_position}});
        auto v_total = v_cache->narrow({{1, 0, cache_position}});

        return std::make_pair(k_total->contiguous(), v_total->contiguous());
    }
};

} // namespace infinilm::cache
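A minimal usage sketch of the cache above during prefill and single-token decode. The include path, function name, and tensor shapes are illustrative assumptions; only the KVCache API shown in this diff is relied on.

#include "kv_cache.hpp" // hypothetical path for the header added in this PR

using infinicore::DataType;
using infinicore::Device;
using infinicore::Tensor;

// Sketch of one prefill pass followed by one decode step.
void kv_cache_usage_sketch() {
    const size_t n_kv_head = 8, head_dim = 128, prefill_len = 16, step_len = 1;
    Device cpu(Device::Type::CPU, 0);

    infinilm::cache::KVCache cache;

    // Prefill: append K/V for the prompt tokens (placeholder tensors for this sketch).
    auto k_prefill = Tensor::empty({n_kv_head, prefill_len, head_dim}, DataType::F32, cpu);
    auto v_prefill = Tensor::empty({n_kv_head, prefill_len, head_dim}, DataType::F32, cpu);
    auto [k_total, v_total] = cache.update(k_prefill, v_prefill);
    // cache.cache_position == 16; k_total/v_total cover every cached position.

    // Decode: append one token per step; capacity grows lazily inside ensure_capacity().
    auto k_step = Tensor::empty({n_kv_head, step_len, head_dim}, DataType::F32, cpu);
    auto v_step = Tensor::empty({n_kv_head, step_len, head_dim}, DataType::F32, cpu);
    auto [k_all, v_all] = cache.update(k_step, v_step);
    // k_all/v_all have shape [n_kv_head, 17, head_dim] and feed the attention op.
}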
Hook registry implementation (new file, namespace infinilm::models::debug_utils):

@@ -0,0 +1,44 @@
#include "hooks.hpp"
#include <spdlog/spdlog.h>

namespace infinilm::models::debug_utils {

void HookRegistry::register_hook(const std::string &name, HookCallback callback) {
    hooks_[name] = callback;
    SPDLOG_DEBUG("HookRegistry: Registered hook '{}'", name);
}

void HookRegistry::call_hook(const std::string &name, const infinicore::Tensor &tensor, int layer_idx) const {
    // Try an exact match first
    auto it = hooks_.find(name);
    if (it != hooks_.end()) {
        try {
            it->second(name, tensor, layer_idx);
        } catch (const std::exception &e) {
            SPDLOG_ERROR("HookRegistry: Error calling hook '{}': {}", name, e.what());
        }
        return;
    }

    // Fall back to pattern matching (e.g., "layer0_*" matches "layer0_q_after_proj")
    for (const auto &[pattern, callback] : hooks_) {
        if (!pattern.empty() && pattern.back() == '*' && name.size() >= pattern.size() - 1) {
            std::string prefix = pattern.substr(0, pattern.size() - 1);
            if (name.substr(0, prefix.size()) == prefix) {
                try {
                    callback(name, tensor, layer_idx);
                } catch (const std::exception &e) {
                    SPDLOG_ERROR("HookRegistry: Error calling hook pattern '{}' for '{}': {}", pattern, name, e.what());
                }
                return;
            }
        }
    }
}

void HookRegistry::clear() {
    hooks_.clear();
    SPDLOG_DEBUG("HookRegistry: Cleared all hooks");
}

} // namespace infinilm::models::debug_utils
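A short driver sketch for the registry above; the hook names are illustrative, and the tensor argument is whatever intermediate value the caller wants to inspect.

#include "hooks.hpp"
#include <spdlog/spdlog.h>

// Sketch: register an exact-name hook and a wildcard hook, then fire them.
void hook_registry_usage_sketch(const infinicore::Tensor &some_tensor) {
    infinilm::models::debug_utils::HookRegistry registry;

    registry.register_hook("layer0_q_after_proj",
        [](const std::string &name, const infinicore::Tensor &, int layer_idx) {
            SPDLOG_INFO("hook '{}' fired (layer {})", name, layer_idx);
        });

    // Wildcard hook: matches every name starting with "layer1_".
    registry.register_hook("layer1_*",
        [](const std::string &name, const infinicore::Tensor &, int layer_idx) {
            SPDLOG_INFO("pattern hook matched '{}' (layer {})", name, layer_idx);
        });

    registry.call_hook("layer0_q_after_proj", some_tensor, 0); // exact match
    registry.call_hook("layer1_k_after_rope", some_tensor, 1); // matched via "layer1_*"
    registry.call_hook("layer2_unregistered", some_tensor, 2); // no hook; silently ignored
}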
hooks.hpp (new file, debug-only hook utilities and macros):

@@ -0,0 +1,186 @@
#pragma once

#include "infinicore/tensor.hpp"
#include <functional>
#include <string>
#include <memory>
#include <unordered_map>

namespace infinilm::models::debug_utils {

// TODO: move to InfiniCore as common utils in future work

/**
 * @brief Hook callback type for capturing intermediate values (DEBUG ONLY)
 *
 * Hook functions are called with:
 *   - name: Identifier for the intermediate value (e.g., "layer0_q_after_proj")
 *   - tensor: The intermediate tensor value
 *   - layer_idx: Layer index (for layer-specific hooks, -1 if not applicable)
 *
 * NOTE: This is a debug utility. Do not use in production code.
 */
using HookCallback = std::function<void(const std::string &name, const infinicore::Tensor &tensor, int layer_idx)>;

/**
 * @brief Hook registry for managing hooks (DEBUG ONLY)
 *
 * NOTE: This is a debug utility for capturing intermediate tensor values
 * during model execution. Do not use in production code.
 */
class HookRegistry {
public:
    /**
     * @brief Register a hook callback
     *
     * @param name Hook name (either a specific name or a pattern like "layer0_*")
     * @param callback Hook callback function
     */
    void register_hook(const std::string &name, HookCallback callback);

    /**
     * @brief Call the hook if one is registered
     *
     * @param name Full hook name
     * @param tensor Tensor to pass to the hook
     * @param layer_idx Layer index (-1 if not applicable)
     */
    void call_hook(const std::string &name, const infinicore::Tensor &tensor, int layer_idx = -1) const;

    /**
     * @brief Clear all hooks
     */
    void clear();

    /**
     * @brief Check whether any hooks are registered
     */
    bool has_hooks() const { return !hooks_.empty(); }

private:
    std::unordered_map<std::string, HookCallback> hooks_;
};

/**
 * @brief Macro to simplify hook registration (DEBUG ONLY)
 *
 * Usage: REGISTER_HOOK(registry, "hook_name", callback)
 */
#define REGISTER_HOOK(registry, name, callback) \
    (registry)->register_hook(name, callback)

/**
 * @brief Macro to simplify hook calls with automatic null and has_hooks checks (DEBUG ONLY)
 *
 * Usage: CALL_HOOK(registry, "hook_name", tensor)
 * Note: layer_idx defaults to -1
 */
#define CALL_HOOK(registry, name, tensor) \
    do { \
        if ((registry) && (registry)->has_hooks()) { \
            (registry)->call_hook(name, tensor, -1); \
        } \
    } while (0)

/**
 * @brief Macro to simplify hook calls with an explicit layer index (DEBUG ONLY)
 *
 * Usage: CALL_HOOK_LAYER(registry, "hook_name", tensor, layer_idx)
 */
#define CALL_HOOK_LAYER(registry, name, tensor, layer_idx) \
    do { \
        if ((registry) && (registry)->has_hooks()) { \
            (registry)->call_hook(name, tensor, layer_idx); \
        } \
    } while (0)

/**
 * @brief Macros to simplify hook_registry and hook_prefix management in model classes
 */

// Declare the hook_registry and hook_prefix member variables
#define HOOK_REGISTRY_MEMBER() \
    std::shared_ptr<debug_utils::HookRegistry> hook_registry_; \
    std::string hook_prefix_;

// Set hook_registry and hook_prefix (no forwarding to submodules)
#define SET_HOOK_REGISTRY_SIMPLE() \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
        hook_registry_ = hook_registry; \
        hook_prefix_ = hook_prefix; \
    }

// Helper macro to build an incremental hook prefix
#define BUILD_HOOK_PREFIX(prefix, name) \
    (prefix.empty() ? std::string(name) : prefix + "_" + std::string(name))

// Set hook_registry and hook_prefix and forward to one or more submodules.
// Usage: SET_HOOK_REGISTRY(submodule1) or SET_HOOK_REGISTRY(submodule1, submodule2)
// The hook_prefix is extended for each submodule (e.g., "layer0" -> "layer0_attention").
// Note: Currently supports up to 2 submodules. For more, extend the pattern below.
#define SET_HOOK_REGISTRY(...) \
    SET_HOOK_REGISTRY_IMPL(__VA_ARGS__)

// Dispatch on the argument count: the implementation macros appended after __VA_ARGS__
// shift so that the third parameter of SET_HOOK_REGISTRY_GET_NTH is the matching one
// (SET_HOOK_REGISTRY_1 for one argument, SET_HOOK_REGISTRY_2 for two).
#define SET_HOOK_REGISTRY_IMPL(...) \
    SET_HOOK_REGISTRY_GET_NTH(__VA_ARGS__, SET_HOOK_REGISTRY_2, SET_HOOK_REGISTRY_1, SET_HOOK_REGISTRY_0, )(__VA_ARGS__)

// Select the third argument, which is the implementation macro for the given arity.
#define SET_HOOK_REGISTRY_GET_NTH(_1, _2, _3, N, ...) _3

// Implementation for 0 args (shouldn't be used, but handled gracefully)
#define SET_HOOK_REGISTRY_0() \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
        hook_registry_ = hook_registry; \
        hook_prefix_ = hook_prefix; \
    }

// Implementation for 1 arg
#define SET_HOOK_REGISTRY_1(submodule) \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
        hook_registry_ = hook_registry; \
        hook_prefix_ = hook_prefix; \
        if (submodule##_) { \
            std::string submodule_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule); \
            submodule##_->set_hook_registry(hook_registry, submodule_prefix); \
        } \
    }

// Implementation for 2 args
#define SET_HOOK_REGISTRY_2(submodule1, submodule2) \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
        hook_registry_ = hook_registry; \
        hook_prefix_ = hook_prefix; \
        if (submodule1##_) { \
            std::string submodule1_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule1); \
            submodule1##_->set_hook_registry(hook_registry, submodule1_prefix); \
        } \
        if (submodule2##_) { \
            std::string submodule2_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule2); \
            submodule2##_->set_hook_registry(hook_registry, submodule2_prefix); \
        } \
    }

// Set hook_registry and hook_prefix for a vector of submodules.
// For vectors, the prefix is extended with an index (e.g., "layer0", "layer1", ...).
// If the parent has a prefix, it becomes "parent_layer0", "parent_layer1", etc.
#define SET_HOOK_REGISTRY_VEC(vec_name) \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
        hook_registry_ = hook_registry; \
        hook_prefix_ = hook_prefix; \
        for (size_t i = 0; i < vec_name##_.size(); ++i) { \
            if (vec_name##_[i]) { \
                std::string layer_name = "layer" + std::to_string(i); \
                std::string item_prefix = BUILD_HOOK_PREFIX(hook_prefix, layer_name); \
                vec_name##_[i]->set_hook_registry(hook_registry, item_prefix); \
            } \
        } \
    }

} // namespace infinilm::models::debug_utils
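To make the forwarding behaviour concrete, here is a hedged sketch of how these macros could compose in nn::module-style classes. LlamaAttention, LlamaDecoderLayer, LlamaModel, and their members are hypothetical names for illustration and are not taken from this diff.

#include "hooks.hpp"
#include "infinicore/tensor.hpp"
#include <memory>
#include <string>
#include <vector>

namespace infinilm::models {

// Hypothetical leaf module: stores the registry/prefix and fires hooks in forward().
class LlamaAttention {
public:
    SET_HOOK_REGISTRY_SIMPLE()

    infinicore::Tensor forward(const infinicore::Tensor &q) {
        // With prefix "layer0_attention" this fires "layer0_attention_q_after_proj".
        CALL_HOOK(hook_registry_, BUILD_HOOK_PREFIX(hook_prefix_, "q_after_proj"), q);
        return q; // placeholder for the real attention computation
    }

private:
    HOOK_REGISTRY_MEMBER()
};

// Hypothetical decoder layer: forwards the registry to its attention submodule.
class LlamaDecoderLayer {
public:
    SET_HOOK_REGISTRY(attention) // extends the prefix to "<parent>_attention"

private:
    std::shared_ptr<LlamaAttention> attention_;
    HOOK_REGISTRY_MEMBER()
};

// Hypothetical top-level model: forwards to a vector of layers as "layer0", "layer1", ...
class LlamaModel {
public:
    SET_HOOK_REGISTRY_VEC(layers)

private:
    std::vector<std::shared_ptr<LlamaDecoderLayer>> layers_;
    HOOK_REGISTRY_MEMBER()
};

} // namespace infinilm::models

Wiring it up from the outside would then amount to calling model.set_hook_registry(registry) with a shared HookRegistry; the CALL_HOOK sites stay inert whenever no hooks are registered.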
Review comment: This could live in infinicore as common infrastructure for nn::module.