@@ -343,13 +343,6 @@ class CLIPTokenizer {
343343 }
344344 }
345345
346- std::string clean_up_tokenization (std::string& text) {
347- std::regex pattern (R"( ,)" );
348- // Replace " ," with ","
349- std::string result = std::regex_replace (text, pattern, " ," );
350- return result;
351- }
352-
353346 std::string decode (const std::vector<int >& tokens) {
354347 std::string text = " " ;
355348 for (int t : tokens) {
@@ -358,12 +351,8 @@ class CLIPTokenizer {
358351 std::u32string ts = decoder[t];
359352 // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
360353 std::string s = utf32_to_utf8 (ts);
361- if (s.length () >= 4 ) {
362- if (ends_with (s, " </w>" )) {
363- text += s.replace (s.length () - 4 , s.length () - 1 , " " ) + " " ;
364- } else {
365- text += s;
366- }
354+ if (s.length () >= 4 && ends_with (s, " </w>" )) {
355+ text += " " + s.replace (s.length () - 4 , s.length () - 1 , " " );
367356 } else {
368357 text += " " + s;
369358 }
@@ -375,7 +364,6 @@ class CLIPTokenizer {
375364
376365 // std::string s((char *)bytes.data());
377366 // std::string s = "";
378- text = clean_up_tokenization (text);
379367 return trim (text);
380368 }
381369
@@ -545,12 +533,9 @@ class CLIPEmbeddings : public GGMLBlock {
545533 int64_t vocab_size;
546534 int64_t num_positions;
547535
548- void init_params (struct ggml_context * ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = " " ) {
549- enum ggml_type token_wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
550- enum ggml_type position_wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
551-
552- params[" token_embedding.weight" ] = ggml_new_tensor_2d (ctx, token_wtype, embed_dim, vocab_size);
553- params[" position_embedding.weight" ] = ggml_new_tensor_2d (ctx, position_wtype, embed_dim, num_positions);
536+ void init_params (struct ggml_context * ctx, ggml_type wtype) {
537+ params[" token_embedding.weight" ] = ggml_new_tensor_2d (ctx, wtype, embed_dim, vocab_size);
538+ params[" position_embedding.weight" ] = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, embed_dim, num_positions);
554539 }
555540
556541public:
@@ -594,14 +579,11 @@ class CLIPVisionEmbeddings : public GGMLBlock {
594579 int64_t image_size;
595580 int64_t num_patches;
596581 int64_t num_positions;
597- void init_params (struct ggml_context * ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = " " ) {
598- enum ggml_type patch_wtype = GGML_TYPE_F16; // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
599- enum ggml_type class_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
600- enum ggml_type position_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
601582
602- params[" patch_embedding.weight" ] = ggml_new_tensor_4d (ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
603- params[" class_embedding" ] = ggml_new_tensor_1d (ctx, class_wtype, embed_dim);
604- params[" position_embedding.weight" ] = ggml_new_tensor_2d (ctx, position_wtype, embed_dim, num_positions);
583+ void init_params (struct ggml_context * ctx, ggml_type wtype) {
584+ params[" patch_embedding.weight" ] = ggml_new_tensor_4d (ctx, GGML_TYPE_F16, patch_size, patch_size, num_channels, embed_dim);
585+ params[" class_embedding" ] = ggml_new_tensor_1d (ctx, GGML_TYPE_F32, embed_dim);
586+ params[" position_embedding.weight" ] = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, embed_dim, num_positions);
605587 }
606588
607589public:
@@ -657,11 +639,9 @@ enum CLIPVersion {
657639
658640class CLIPTextModel : public GGMLBlock {
659641protected:
660- void init_params (struct ggml_context * ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = " " ) {
642+ void init_params (struct ggml_context * ctx, ggml_type wtype ) {
661643 if (version == OPEN_CLIP_VIT_BIGG_14) {
662- enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
663- params[" text_projection" ] = ggml_new_tensor_2d (ctx, wtype, projection_dim, hidden_size);
664- ggml_set_name (params[" text_projection" ], (prefix + " text_projection" ).c_str ());
644+ params[" text_projection" ] = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, projection_dim, hidden_size);
665645 }
666646 }
667647
@@ -785,17 +765,14 @@ class CLIPVisionModel : public GGMLBlock {
785765 auto x = embeddings->forward (ctx, pixel_values); // [N, num_positions, embed_dim]
786766 x = pre_layernorm->forward (ctx, x);
787767 x = encoder->forward (ctx, x, -1 , false );
788- // print_ggml_tensor(x, true, "ClipVisionModel x: ");
789- auto last_hidden_state = x;
790- x = post_layernorm->forward (ctx, x); // [N, n_token, hidden_size]
768+ x = post_layernorm->forward (ctx, x); // [N, n_token, hidden_size]
791769
792770 GGML_ASSERT (x->ne [3 ] == 1 );
793771 if (return_pooled) {
794772 ggml_tensor* pooled = ggml_cont (ctx, ggml_view_2d (ctx, x, x->ne [0 ], x->ne [2 ], x->nb [2 ], 0 ));
795773 return pooled; // [N, hidden_size]
796774 } else {
797- // return x; // [N, n_token, hidden_size]
798- return last_hidden_state; // [N, n_token, hidden_size]
775+ return x; // [N, n_token, hidden_size]
799776 }
800777 }
801778};
@@ -806,14 +783,13 @@ class CLIPProjection : public UnaryBlock {
806783 int64_t out_features;
807784 bool transpose_weight;
808785
809- void init_params (struct ggml_context * ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = " " ) {
810- enum ggml_type wtype = tensor_types.find (prefix + " weight" ) != tensor_types.end () ? tensor_types[prefix + " weight" ] : GGML_TYPE_F32;
786+ void init_params (struct ggml_context * ctx, ggml_type wtype) {
811787 if (transpose_weight) {
788+ LOG_ERROR (" transpose_weight" );
812789 params[" weight" ] = ggml_new_tensor_2d (ctx, wtype, out_features, in_features);
813790 } else {
814791 params[" weight" ] = ggml_new_tensor_2d (ctx, wtype, in_features, out_features);
815792 }
816- ggml_set_name (params[" weight" ], (prefix + " weight" ).c_str ());
817793 }
818794
819795public:
@@ -870,13 +846,12 @@ struct CLIPTextModelRunner : public GGMLRunner {
870846 CLIPTextModel model;
871847
872848 CLIPTextModelRunner (ggml_backend_t backend,
873- std::map<std::string, enum ggml_type>& tensor_types,
874- const std::string prefix,
849+ ggml_type wtype,
875850 CLIPVersion version = OPENAI_CLIP_VIT_L_14,
876851 int clip_skip_value = 1 ,
877852 bool with_final_ln = true )
878- : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
879- model.init (params_ctx, tensor_types, prefix );
853+ : GGMLRunner(backend, wtype ), model(version, clip_skip_value, with_final_ln) {
854+ model.init (params_ctx, wtype );
880855 }
881856
882857 std::string get_desc () {
@@ -918,13 +893,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
918893 struct ggml_tensor * embeddings = NULL ;
919894
920895 if (num_custom_embeddings > 0 && custom_embeddings_data != NULL ) {
921- auto token_embed_weight = model.get_token_embed_weight ();
922- auto custom_embeddings = ggml_new_tensor_2d (compute_ctx,
923- token_embed_weight->type ,
924- model.hidden_size ,
925- num_custom_embeddings);
896+ auto custom_embeddings = ggml_new_tensor_2d (compute_ctx,
897+ wtype,
898+ model.hidden_size ,
899+ num_custom_embeddings);
926900 set_backend_tensor_data (custom_embeddings, custom_embeddings_data);
927901
902+ auto token_embed_weight = model.get_token_embed_weight ();
928903 // concatenate custom embeddings
929904 embeddings = ggml_concat (compute_ctx, token_embed_weight, custom_embeddings, 1 );
930905 }
@@ -936,7 +911,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
936911 return gf;
937912 }
938913
939- bool compute (const int n_threads,
914+ void compute (const int n_threads,
940915 struct ggml_tensor * input_ids,
941916 int num_custom_embeddings,
942917 void * custom_embeddings_data,
@@ -947,7 +922,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
947922 auto get_graph = [&]() -> struct ggml_cgraph * {
948923 return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
949924 };
950- return GGMLRunner::compute (get_graph, n_threads, true , output, output_ctx);
925+ GGMLRunner::compute (get_graph, n_threads, true , output, output_ctx);
951926 }
952927};
953928
0 commit comments