Skip to content

Commit 37f13f4

Browse files
committed
issue/74 add c++ Llama models and align to AutoLlama interface
Signed-off-by: Ceng23333 <[email protected]>
1 parent efc19d7 commit 37f13f4

32 files changed

+4849
-23
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Xmake cache
22
.xmake/
33
build/
4+
python/infinilm/lib/*.so
45

56
# MacOS Cache
67
.DS_Store
@@ -10,12 +11,13 @@ build/
1011

1112
# Python
1213
__pycache__/
14+
*.egg-info/
1315

1416
# Log
1517
*.log
1618

1719
# Cache
18-
cache/
20+
.cache/
1921

2022
# JSON
2123
*.json

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "third_party/spdlog"]
2+
path = third_party/spdlog
3+
url = https://github.com/gabime/spdlog.git

csrc/cache/kv_cache.hpp

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
#pragma once

#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include <algorithm>
#include <utility>
#include <memory>

namespace infinilm::cache {

/**
 * @brief Simple KV cache structure for incremental decoding
 *
 * Stores key and value caches with shape [n_kv_head, capacity, head_dim]
 * Similar to DynamicLayer in Python cache_utils.py
 *
 * This is a common component that can be used by any model architecture
 * that needs KV caching for attention mechanisms.
 */
struct KVCache {
    infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim]
    infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim]
    size_t cache_position;      // Current position in cache
    size_t max_capacity;        // Maximum capacity of cache
    bool initialized;           // Whether cache has been initialized

    // Members are initialized in declaration order, so the initializer list
    // is written in that same order (tensors first, then the scalars) to
    // avoid -Wreorder warnings and to match what actually happens at runtime.
    KVCache()
        : // Create empty placeholder tensors (will be replaced on first use)
          k_cache(infinicore::Tensor::empty({1, 1, 1}, infinicore::DataType::F32,
                                            infinicore::Device(infinicore::Device::Type::CPU, 0))),
          v_cache(infinicore::Tensor::empty({1, 1, 1}, infinicore::DataType::F32,
                                            infinicore::Device(infinicore::Device::Type::CPU, 0))),
          cache_position(0), max_capacity(0), initialized(false) {}

    /**
     * @brief Initialize or update cache capacity
     * @param num_kv_heads Number of key-value heads
     * @param head_dim Head dimension
     * @param seq_len Sequence length of new tokens
     * @param dtype Data type
     * @param device Device
     */
    void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len,
                         infinicore::DataType dtype, const infinicore::Device &device) {
        // Minimum capacity allocated on first use; avoids repeated early regrowth.
        constexpr size_t kInitialCapacity = 4096;

        size_t required_capacity = cache_position + seq_len;

        // Lazy initialization
        if (!initialized) {
            max_capacity = std::max(required_capacity, kInitialCapacity);
            k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
                                                dtype, device);
            v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
                                                dtype, device);
            cache_position = 0;
            initialized = true;
        }
        // Grow cache if needed (similar to DynamicLayer in Python). Doubling
        // keeps the number of reallocations logarithmic in total sequence length.
        else if (required_capacity > max_capacity) {
            size_t new_capacity = std::max(max_capacity * 2, required_capacity);
            auto k_new = infinicore::Tensor::empty({num_kv_heads, new_capacity, head_dim},
                                                   dtype, device);
            auto v_new = infinicore::Tensor::empty({num_kv_heads, new_capacity, head_dim},
                                                   dtype, device);

            // Copy existing cache data into the front of the new buffers
            if (cache_position > 0) {
                auto k_slice = k_cache->narrow({{1, 0, cache_position}});
                auto v_slice = v_cache->narrow({{1, 0, cache_position}});
                k_new->narrow({{1, 0, cache_position}})->copy_from(k_slice);
                v_new->narrow({{1, 0, cache_position}})->copy_from(v_slice);
            }

            k_cache = k_new;
            v_cache = v_new;
            max_capacity = new_capacity;
        }
    }

    /**
     * @brief Update cache with new key and value states
     * @param k_new New key states [n_kv_head, seq_len, head_dim]
     * @param v_new New value states [n_kv_head, seq_len, head_dim]
     * @return Tuple of (k_total, v_total) with shape [n_kv_head, total_seq_len, head_dim]
     *
     * Note: This method writes to the cache. If using with attention op, the attention op
     * also writes to the cache, so this should be called AFTER attention, not before.
     */
    std::pair<infinicore::Tensor, infinicore::Tensor> update(
        const infinicore::Tensor &k_new,
        const infinicore::Tensor &v_new) {
        size_t seq_len = k_new->shape()[1];
        size_t num_kv_heads = k_new->shape()[0];
        size_t head_dim = k_new->shape()[2];

        // Ensure capacity
        ensure_capacity(num_kv_heads, head_dim, seq_len,
                        k_new->dtype(), k_new->device());

        // Copy new k/v into cache at current position
        auto k_dst = k_cache->narrow({{1, cache_position, seq_len}});
        auto v_dst = v_cache->narrow({{1, cache_position, seq_len}});
        k_dst->copy_from(k_new);
        v_dst->copy_from(v_new);

        // Update position
        cache_position += seq_len;

        // Return the total cache up to current position
        auto k_total = k_cache->narrow({{1, 0, cache_position}});
        auto v_total = v_cache->narrow({{1, 0, cache_position}});

        return std::make_pair(k_total->contiguous(), v_total->contiguous());
    }
};

} // namespace infinilm::cache

csrc/models/debug_utils/hooks.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
#include "hooks.hpp"
#include <spdlog/spdlog.h>

namespace infinilm::models::debug_utils {

// Store (or overwrite) a hook under `name`. std::function is movable; moving
// avoids copying any captured state in the callback.
void HookRegistry::register_hook(const std::string &name, HookCallback callback) {
    hooks_[name] = std::move(callback);
    SPDLOG_DEBUG("HookRegistry: Registered hook '{}'", name);
}

// Invoke the hook registered under `name`, if any. Exact-name matches win over
// trailing-'*' prefix patterns; hook exceptions are logged, never propagated,
// so a broken debug hook cannot crash model execution.
void HookRegistry::call_hook(const std::string &name, const infinicore::Tensor &tensor, int layer_idx) const {
    // Try exact match first
    auto it = hooks_.find(name);
    if (it != hooks_.end()) {
        try {
            it->second(name, tensor, layer_idx);
        } catch (const std::exception &e) {
            SPDLOG_ERROR("HookRegistry: Error calling hook '{}': {}", name, e.what());
        }
        return;
    }

    // Try pattern matching (e.g., "layer0_*" matches "layer0_q_after_proj").
    // Guard against empty pattern names: calling back() on an empty string is UB.
    for (const auto &[pattern, callback] : hooks_) {
        if (!pattern.empty() && pattern.back() == '*' && name.size() >= pattern.size() - 1) {
            std::string prefix = pattern.substr(0, pattern.size() - 1);
            if (name.compare(0, prefix.size(), prefix) == 0) {
                try {
                    callback(name, tensor, layer_idx);
                } catch (const std::exception &e) {
                    SPDLOG_ERROR("HookRegistry: Error calling hook pattern '{}' for '{}': {}", pattern, name, e.what());
                }
                return;
            }
        }
    }
}

// Remove every registered hook.
void HookRegistry::clear() {
    hooks_.clear();
    SPDLOG_DEBUG("HookRegistry: Cleared all hooks");
}

} // namespace infinilm::models::debug_utils

csrc/models/debug_utils/hooks.hpp

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
#pragma once

#include "infinicore/tensor.hpp"
#include <functional>
#include <string>
#include <memory>
#include <unordered_map>

namespace infinilm::models::debug_utils {

// TODO: move to InfiniCore as common utils in future work

/**
 * @brief Hook callback type for capturing intermediate values (DEBUG ONLY)
 *
 * Hook functions are called with:
 * - name: Identifier for the intermediate value (e.g., "layer0_q_after_proj")
 * - tensor: The intermediate tensor value
 * - layer_idx: Layer index (for layer-specific hooks, -1 if not applicable)
 *
 * NOTE: This is a debug utility. Do not use in production code.
 */
using HookCallback = std::function<void(const std::string &name, const infinicore::Tensor &tensor, int layer_idx)>;

/**
 * @brief Hook registry for managing hooks (DEBUG ONLY)
 *
 * NOTE: This is a debug utility for capturing intermediate tensor values
 * during model execution. Do not use in production code.
 */
class HookRegistry {
public:
    /**
     * @brief Register a hook callback
     *
     * @param name Hook name (can be pattern like "layer0_*" or specific name)
     * @param callback Hook callback function
     */
    void register_hook(const std::string &name, HookCallback callback);

    /**
     * @brief Call hook if registered
     *
     * @param name Full hook name
     * @param tensor Tensor to pass to hook
     * @param layer_idx Layer index (-1 if not applicable)
     */
    void call_hook(const std::string &name, const infinicore::Tensor &tensor, int layer_idx = -1) const;

    /**
     * @brief Clear all hooks
     */
    void clear();

    /**
     * @brief Check if any hooks are registered
     */
    bool has_hooks() const { return !hooks_.empty(); }

private:
    std::unordered_map<std::string, HookCallback> hooks_;
};

/**
 * @brief Macro to simplify hook registration (DEBUG ONLY)
 *
 * Usage: REGISTER_HOOK(registry, "hook_name", callback)
 */
#define REGISTER_HOOK(registry, name, callback) \
    (registry)->register_hook(name, callback)

/**
 * @brief Macro to simplify hook calls with automatic null and has_hooks checks (DEBUG ONLY)
 *
 * Usage: CALL_HOOK(registry, "hook_name", tensor)
 * Note: layer_idx defaults to -1
 */
#define CALL_HOOK(registry, name, tensor)             \
    do {                                              \
        if ((registry) && (registry)->has_hooks()) {  \
            (registry)->call_hook(name, tensor, -1);  \
        }                                             \
    } while (0)

/**
 * @brief Macro to simplify hook calls with explicit layer index (DEBUG ONLY)
 *
 * Usage: CALL_HOOK_LAYER(registry, "hook_name", tensor, layer_idx)
 */
#define CALL_HOOK_LAYER(registry, name, tensor, layer_idx)   \
    do {                                                     \
        if ((registry) && (registry)->has_hooks()) {         \
            (registry)->call_hook(name, tensor, layer_idx);  \
        }                                                    \
    } while (0)

/**
 * @brief Macros to simplify hook_registry and hook_prefix management in model classes
 */

// Declare hook_registry and hook_prefix member variables
#define HOOK_REGISTRY_MEMBER()                                   \
    std::shared_ptr<debug_utils::HookRegistry> hook_registry_;   \
    std::string hook_prefix_;

// Set hook_registry and hook_prefix (no forwarding to submodules)
#define SET_HOOK_REGISTRY_SIMPLE()                                                                                                   \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") {   \
        hook_registry_ = hook_registry;                                                                                              \
        hook_prefix_ = hook_prefix;                                                                                                  \
    }

// Helper macro to build incremental hook prefix.
// Arguments are parenthesized so expressions (e.g. `cond ? a : b`) expand safely.
#define BUILD_HOOK_PREFIX(prefix, name) \
    ((prefix).empty() ? std::string(name) : (prefix) + "_" + std::string(name))

// Set hook_registry and hook_prefix and forward to one or more submodules
// Usage: SET_HOOK_REGISTRY(submodule1) or SET_HOOK_REGISTRY(submodule1, submodule2)
// The hook_prefix will be incremented for each submodule (e.g., "layer0" -> "layer0_attention")
// Note: Currently supports 1 or 2 submodules. For more, extend the pattern below.
#define SET_HOOK_REGISTRY(...) \
    SET_HOOK_REGISTRY_IMPL(__VA_ARGS__)

// Dispatch on argument count by shifting a list of implementation names:
// the extra trailing names push the right selector into the _3 slot.
#define SET_HOOK_REGISTRY_IMPL(...) \
    SET_HOOK_REGISTRY_GET_NTH(__VA_ARGS__, SET_HOOK_REGISTRY_2, SET_HOOK_REGISTRY_1, SET_HOOK_REGISTRY_0, )(__VA_ARGS__)

// Selector: always picks the 3rd parameter.
// For 1 arg:  (_1=arg,  _2=SET_HOOK_REGISTRY_2, _3=SET_HOOK_REGISTRY_1, ...) -> SET_HOOK_REGISTRY_1
// For 2 args: (_1=arg1, _2=arg2,                _3=SET_HOOK_REGISTRY_2, ...) -> SET_HOOK_REGISTRY_2
// NOTE(review): a zero-argument SET_HOOK_REGISTRY() does NOT reach
// SET_HOOK_REGISTRY_0 through this dispatcher; use SET_HOOK_REGISTRY_SIMPLE()
// when there are no submodules to forward to.
#define SET_HOOK_REGISTRY_GET_NTH(_1, _2, _3, N, ...) _3

// Implementation for 0 args (kept for completeness; prefer SET_HOOK_REGISTRY_SIMPLE)
#define SET_HOOK_REGISTRY_0()                                                                                                        \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") {   \
        hook_registry_ = hook_registry;                                                                                              \
        hook_prefix_ = hook_prefix;                                                                                                  \
    }

// Implementation for 1 arg
#define SET_HOOK_REGISTRY_1(submodule)                                                                                               \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") {   \
        hook_registry_ = hook_registry;                                                                                              \
        hook_prefix_ = hook_prefix;                                                                                                  \
        if (submodule##_) {                                                                                                          \
            std::string submodule_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule);                                               \
            submodule##_->set_hook_registry(hook_registry, submodule_prefix);                                                        \
        }                                                                                                                            \
    }

// Implementation for 2 args
#define SET_HOOK_REGISTRY_2(submodule1, submodule2)                                                                                  \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") {   \
        hook_registry_ = hook_registry;                                                                                              \
        hook_prefix_ = hook_prefix;                                                                                                  \
        if (submodule1##_) {                                                                                                         \
            std::string submodule1_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule1);                                             \
            submodule1##_->set_hook_registry(hook_registry, submodule1_prefix);                                                      \
        }                                                                                                                            \
        if (submodule2##_) {                                                                                                         \
            std::string submodule2_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule2);                                             \
            submodule2##_->set_hook_registry(hook_registry, submodule2_prefix);                                                      \
        }                                                                                                                            \
    }

// Set hook_registry and hook_prefix for a vector of submodules
// For vectors, the prefix is incremented with an index (e.g., "layer0", "layer1", ...)
// If parent has a prefix, it becomes "parent_layer0", "parent_layer1", etc.
#define SET_HOOK_REGISTRY_VEC(vec_name)                                                                                              \
    void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") {   \
        hook_registry_ = hook_registry;                                                                                              \
        hook_prefix_ = hook_prefix;                                                                                                  \
        for (size_t i = 0; i < vec_name##_.size(); ++i) {                                                                            \
            if (vec_name##_[i]) {                                                                                                    \
                std::string layer_name = "layer" + std::to_string(i);                                                                \
                std::string item_prefix = BUILD_HOOK_PREFIX(hook_prefix, layer_name);                                                \
                vec_name##_[i]->set_hook_registry(hook_registry, item_prefix);                                                       \
            }                                                                                                                        \
        }                                                                                                                            \
    }

} // namespace infinilm::models::debug_utils

0 commit comments

Comments
 (0)