Commit f14e2d8

fix: restore upstream clip & conditioner after bad merge
1 parent 22b63da

File tree

2 files changed: +165 -574 lines

clip.hpp

Lines changed: 25 additions & 50 deletions
@@ -343,13 +343,6 @@ class CLIPTokenizer {
         }
     }
 
-    std::string clean_up_tokenization(std::string& text) {
-        std::regex pattern(R"( ,)");
-        // Replace " ," with ","
-        std::string result = std::regex_replace(text, pattern, ",");
-        return result;
-    }
-
     std::string decode(const std::vector<int>& tokens) {
         std::string text = "";
         for (int t : tokens) {
@@ -358,12 +351,8 @@ class CLIPTokenizer {
             std::u32string ts = decoder[t];
             // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
             std::string s = utf32_to_utf8(ts);
-            if (s.length() >= 4) {
-                if (ends_with(s, "</w>")) {
-                    text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
-                } else {
-                    text += s;
-                }
+            if (s.length() >= 4 && ends_with(s, "</w>")) {
+                text += " " + s.replace(s.length() - 4, s.length() - 1, "");
             } else {
                 text += " " + s;
             }
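A minimal standalone sketch of the end-of-word handling restored above (the token pieces and the simplified ends_with helper are illustrative, not taken from clip.hpp): BPE decoder entries carry a trailing "</w>" marker, and decode() turns that marker into a word boundary by stripping it and prepending a space.

    #include <iostream>
    #include <string>
    #include <vector>

    // simplified stand-in for the ends_with() helper used by clip.hpp
    static bool ends_with(const std::string& s, const std::string& suffix) {
        return s.size() >= suffix.size() &&
               s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
    }

    int main() {
        // hypothetical decoded BPE pieces; "</w>" marks the end of a word
        std::vector<std::string> pieces = {"a</w>", "photo</w>", "of</w>", "cat</w>"};
        std::string text;
        for (std::string s : pieces) {
            if (s.length() >= 4 && ends_with(s, "</w>")) {
                text += " " + s.replace(s.length() - 4, s.length() - 1, "");  // strip "</w>"
            } else {
                text += " " + s;
            }
        }
        std::cout << "[" << text << "]\n";  // "[ a photo of cat]"; decode() trims the leading space
    }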
@@ -375,7 +364,6 @@ class CLIPTokenizer {
 
         // std::string s((char *)bytes.data());
         // std::string s = "";
-        text = clean_up_tokenization(text);
         return trim(text);
     }
 
@@ -545,12 +533,9 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t vocab_size;
     int64_t num_positions;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type token_wtype    = GGML_TYPE_F32;  //(tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
-        enum ggml_type position_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
-
-        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, wtype, embed_dim, vocab_size);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
     }
 
 public:
@@ -594,14 +579,11 @@ class CLIPVisionEmbeddings : public GGMLBlock {
     int64_t image_size;
     int64_t num_patches;
     int64_t num_positions;
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type patch_wtype    = GGML_TYPE_F16;  // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
-        enum ggml_type class_wtype    = GGML_TYPE_F32;  // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
-        enum ggml_type position_wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
 
-        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
-        params["class_embedding"]           = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, patch_size, patch_size, num_channels, embed_dim);
+        params["class_embedding"]           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
     }
 
 public:
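The recurring change in the two embedding hunks above (and in the CLIPTextModel and CLIPProjection hunks below) is the init_params contract: instead of a per-tensor type map keyed by "<prefix>tensor_name", each block once again receives a single ggml_type. A minimal sketch of an override under the restored contract (MyBlock and its tensor shapes are hypothetical, not part of this diff):

    class MyBlock : public GGMLBlock {
    protected:
        void init_params(struct ggml_context* ctx, ggml_type wtype) {
            // quantizable weights follow the runner-wide wtype; tensors that must
            // stay full precision are pinned to GGML_TYPE_F32 explicitly
            params["weight"] = ggml_new_tensor_2d(ctx, wtype, 768, 768);
            params["bias"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 768);
        }
    };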
@@ -657,11 +639,9 @@ enum CLIPVersion {
 
 class CLIPTextModel : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
         if (version == OPEN_CLIP_VIT_BIGG_14) {
-            enum ggml_type wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
-            params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
-            ggml_set_name(params["text_projection"], (prefix + "text_projection").c_str());
+            params["text_projection"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
         }
     }
 
@@ -785,17 +765,14 @@ class CLIPVisionModel : public GGMLBlock {
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x = pre_layernorm->forward(ctx, x);
         x = encoder->forward(ctx, x, -1, false);
-        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
-        auto last_hidden_state = x;
-        x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
+        x = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
         GGML_ASSERT(x->ne[3] == 1);
         if (return_pooled) {
             ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
             return pooled;  // [N, hidden_size]
         } else {
-            // return x; // [N, n_token, hidden_size]
-            return last_hidden_state;  // [N, n_token, hidden_size]
+            return x;  // [N, n_token, hidden_size]
         }
     }
 };
@@ -806,14 +783,13 @@ class CLIPProjection : public UnaryBlock {
     int64_t out_features;
     bool transpose_weight;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
         if (transpose_weight) {
+            LOG_ERROR("transpose_weight");
             params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
         } else {
             params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
         }
-        ggml_set_name(params["weight"], (prefix + "weight").c_str());
     }
 
 public:
@@ -870,13 +846,12 @@ struct CLIPTextModelRunner : public GGMLRunner {
     CLIPTextModel model;
 
     CLIPTextModelRunner(ggml_backend_t backend,
-                        std::map<std::string, enum ggml_type>& tensor_types,
-                        const std::string prefix,
+                        ggml_type wtype,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         int clip_skip_value = 1,
                         bool with_final_ln  = true)
-        : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
-        model.init(params_ctx, tensor_types, prefix);
+        : GGMLRunner(backend, wtype), model(version, clip_skip_value, with_final_ln) {
+        model.init(params_ctx, wtype);
    }
 
     std::string get_desc() {
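A usage sketch of the restored constructor, assuming a plain CPU backend and illustrative argument values (ggml_backend_cpu_init() is the stock ggml entry point; nothing here is prescribed by the diff beyond the parameter order):

    ggml_backend_t backend = ggml_backend_cpu_init();
    CLIPTextModelRunner runner(backend,
                               GGML_TYPE_F16,         // wtype, forwarded to GGMLRunner and model.init
                               OPENAI_CLIP_VIT_L_14,  // version
                               1,                     // clip_skip_value
                               true);                 // with_final_ln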
@@ -918,13 +893,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
         struct ggml_tensor* embeddings = NULL;
 
         if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
-            auto token_embed_weight = model.get_token_embed_weight();
-            auto custom_embeddings  = ggml_new_tensor_2d(compute_ctx,
-                                                         token_embed_weight->type,
-                                                         model.hidden_size,
-                                                         num_custom_embeddings);
+            auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
+                                                        wtype,
+                                                        model.hidden_size,
+                                                        num_custom_embeddings);
             set_backend_tensor_data(custom_embeddings, custom_embeddings_data);
 
+            auto token_embed_weight = model.get_token_embed_weight();
             // concatenate custom embeddings
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
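The restored ggml_concat call appends the custom embedding rows to the token-embedding matrix along dim 1, so custom token ids index past the original vocabulary. A standalone sketch of those semantics with illustrative sizes (no_alloc is used because only the result shape is inspected, so no tensor data is allocated):

    #include "ggml.h"

    int main() {
        struct ggml_init_params p = {/*mem_size=*/1024 * 1024, /*mem_buffer=*/NULL, /*no_alloc=*/true};
        struct ggml_context* ctx  = ggml_init(p);

        // [embed_dim, vocab_size] and [embed_dim, num_custom_embeddings]
        struct ggml_tensor* tok = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 49408);
        struct ggml_tensor* cus = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 4);

        struct ggml_tensor* all = ggml_concat(ctx, tok, cus, 1);
        // all->ne[0] == 768, all->ne[1] == 49412: four extra vocab rows appended
        ggml_free(ctx);
        return 0;
    }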
@@ -936,7 +911,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return gf;
     }
 
-    bool compute(const int n_threads,
+    void compute(const int n_threads,
                  struct ggml_tensor* input_ids,
                  int num_custom_embeddings,
                  void* custom_embeddings_data,
@@ -947,7 +922,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
         };
-        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
 };
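A call sketch under the restored void-returning compute(); input_ids and work_ctx are assumed to be prepared elsewhere, and the trailing parameters (max_token_idx, return_pooled, output, output_ctx) are reconstructed from the build_graph lambda above rather than shown verbatim in this diff:

    struct ggml_tensor* hidden_states = NULL;
    runner.compute(/*n_threads=*/4,
                   input_ids,
                   /*num_custom_embeddings=*/0,
                   /*custom_embeddings_data=*/NULL,
                   /*max_token_idx=*/0,
                   /*return_pooled=*/false,
                   &hidden_states,
                   work_ctx);
    // no bool result to check any more; errors surface inside GGMLRunner::compute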