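[fix] Fix cosyvoice Deinit bug: hold Token2Wav in a std::unique_ptr and call its Deinit() explicitly; null-guard lLaMa_/lLaMa_ctx_ Stop/Deinit in main_llm; bump llm-openai-api to 1.9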

LittleMouse · LittleMouse · commit bd4c03e9a4d4 · 2025-11-21T14:55:09.000+08:00
diff --git a/projects/llm_framework/main_cosy_voice/src/main.cpp b/projects/llm_framework/main_cosy_voice/src/main.cpp
@@ -65,7 +65,6 @@ class llm_task {
     std::atomic<bool> g_llm_finished{false};
     std::atomic<bool> g_stop{false};
     TokenBuffer g_token_buffer;
-    Token2Wav lToken2Wav;
 
     std::vector<int> prompt_text_token;
     std::vector<unsigned short> prompt_text_embeds;
@@ -81,6 +80,7 @@ class llm_task {
     LLMAttrType mode_config_;
     Token2WavAttr infer_mode_config_;
     std::unique_ptr<LLM> lLaMa_;
+    std::unique_ptr<Token2Wav> lToken2Wav_;
     std::string model_;
     std::string response_format_;
     std::vector<std::string> inputs_;
@@ -312,14 +312,15 @@ class llm_task {
                 lLaMa_.reset();
                 return -2;
             }
-            if (!lToken2Wav.Init(infer_mode_config_)) {
+            lToken2Wav_ = std::make_unique<Token2Wav>();
+            if (!lToken2Wav_->Init(infer_mode_config_)) {
                 lLaMa_->Deinit();
                 lLaMa_.reset();
                 return -1;
             }
             lLaMa_->TextToken2Embeds(prompt_text_token, prompt_text_embeds);
             lLaMa_->SpeechToken2Embeds(prompt_speech_token, prompt_speech_embeds);
-            lToken2Wav.SpeechToken2Embeds(prompt_speech_token, prompt_speech_embeds_flow);
+            lToken2Wav_->SpeechToken2Embeds(prompt_speech_token, prompt_speech_embeds_flow);
 
         } catch (...) {
             SLOGE("config false");
@@ -358,7 +359,7 @@ class llm_task {
     {
         g_llm_finished = false;
         g_token_buffer.erase(g_token_buffer.begin(), g_token_buffer.end());
-        lToken2Wav.reset();
+        lToken2Wav_->clear();
     }
 
     void resample_audio(float *input_buffer, int input_length, float *output_buffer, int *output_length,
@@ -428,7 +429,7 @@ class llm_task {
                 }
             }
 
-            int prompt_token_len = prompt_speech_embeds_flow.size() / lToken2Wav._attr.flow_embed_size;
+            int prompt_token_len = prompt_speech_embeds_flow.size() / lToken2Wav_->_attr.flow_embed_size;
             if (prompt_token_len < 75) {
                 SLOGE("Error, prompt speech token len %d < 75", prompt_token_len);
                 if (llm_thread.joinable()) llm_thread.join();
@@ -450,28 +451,28 @@ class llm_task {
             int token_offset = 0;
             int i            = 0;
             while (true) {
-                this_token_hop_len = (token_offset == 0) ? lToken2Wav._attr.token_hop_len + promot_token_pad
-                                                         : lToken2Wav._attr.token_hop_len;
+                this_token_hop_len = (token_offset == 0) ? lToken2Wav_->_attr.token_hop_len + promot_token_pad
+                                                         : lToken2Wav_->_attr.token_hop_len;
                 std::unique_lock<std::mutex> lock(g_buffer_mutex);
                 g_buffer_cv.wait(lock, [&] {
                     return (g_token_buffer.size() - token_offset >=
-                            this_token_hop_len + lToken2Wav._attr.pre_lookahead_len) ||
+                            this_token_hop_len + lToken2Wav_->_attr.pre_lookahead_len) ||
                            g_llm_finished.load() || g_stop.load();
                 });
                 if (g_stop) {
                     lock.unlock();
                     break;
                 } else if (g_token_buffer.size() - token_offset >=
-                           this_token_hop_len + lToken2Wav._attr.pre_lookahead_len) {
+                           this_token_hop_len + lToken2Wav_->_attr.pre_lookahead_len) {
                     std::vector<SpeechToken> token;
-                    int start = token_offset - std::min(int(token_offset / lToken2Wav._attr.token_hop_len),
-                                                        lToken2Wav._attr.max_infer_chunk_num - 1) *
-                                                   lToken2Wav._attr.token_hop_len;
-                    int end = token_offset + this_token_hop_len + lToken2Wav._attr.pre_lookahead_len;
+                    int start = token_offset - std::min(int(token_offset / lToken2Wav_->_attr.token_hop_len),
+                                                        lToken2Wav_->_attr.max_infer_chunk_num - 1) *
+                                                   lToken2Wav_->_attr.token_hop_len;
+                    int end = token_offset + this_token_hop_len + lToken2Wav_->_attr.pre_lookahead_len;
                     token.insert(token.end(), g_token_buffer.begin() + start, g_token_buffer.begin() + end);
                     lock.unlock();
-                    auto speech = lToken2Wav.infer(token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
-                                                   token_offset, false);
+                    auto speech = lToken2Wav_->infer(token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
+                                                     token_offset, false);
                     token_offset += this_token_hop_len;
                     output.insert(output.end(), speech.begin(), speech.end());
                     double src_ratio =
@@ -507,12 +508,12 @@ class llm_task {
             }
 
             std::vector<SpeechToken> token;
-            int start = g_token_buffer.size() - std::min(int(g_token_buffer.size() / lToken2Wav._attr.token_hop_len),
-                                                         lToken2Wav._attr.max_infer_chunk_num - 1) *
-                                                    lToken2Wav._attr.token_hop_len;
+            int start = g_token_buffer.size() - std::min(int(g_token_buffer.size() / lToken2Wav_->_attr.token_hop_len),
+                                                         lToken2Wav_->_attr.max_infer_chunk_num - 1) *
+                                                    lToken2Wav_->_attr.token_hop_len;
             token.insert(token.end(), g_token_buffer.begin() + start, g_token_buffer.end());
-            auto speech = lToken2Wav.infer(token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
-                                           token_offset - start, true);
+            auto speech = lToken2Wav_->infer(token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
+                                             token_offset - start, true);
             output.insert(output.end(), speech.begin(), speech.end());
             double src_ratio =
                 static_cast<double>(mode_config_.audio_rate) / static_cast<double>(mode_config_.mode_rate);
@@ -662,6 +663,9 @@ class llm_task {
         if (lLaMa_) {
             lLaMa_->Deinit();
         }
+        if (lToken2Wav_) {
+            lToken2Wav_->Deinit();
+        }
     }
 };
 
diff --git a/projects/llm_framework/main_cosy_voice/src/runner/Token2wav.hpp b/projects/llm_framework/main_cosy_voice/src/runner/Token2wav.hpp
@@ -464,7 +464,7 @@ class Token2Wav {
         // fade_in_mel_data is now modified in-place with the faded result.
     }
 
-    void reset()
+    void clear()
     {
         std::unordered_map<std::string, std::vector<float>>().swap(hift_cache_dict);
     }
diff --git a/projects/llm_framework/main_llm/src/main.cpp b/projects/llm_framework/main_llm/src/main.cpp
@@ -404,6 +404,7 @@ class llm_task {
     bool pause()
     {
         if (lLaMa_) lLaMa_->Stop();
+        if (lLaMa_ctx_) lLaMa_ctx_->Stop();
         return true;
     }
 
@@ -414,8 +415,10 @@ class llm_task {
             waitpid(tokenizer_pid_, nullptr, 0);
             tokenizer_pid_ = -1;
         }
-        lLaMa_->Deinit();
-        lLaMa_.reset();
+        if (lLaMa_) lLaMa_->Deinit();
+        if (lLaMa_) lLaMa_.reset();
+        if (lLaMa_ctx_) lLaMa_ctx_->Deinit();
+        if (lLaMa_ctx_) lLaMa_ctx_.reset();
         return true;
     }
 
@@ -447,6 +450,7 @@ class llm_task {
             std::string par;
             async_list_.put(par);
             if (lLaMa_) lLaMa_->Stop();
+            if (lLaMa_ctx_) lLaMa_ctx_->Stop();
             inference_run_->join();
             inference_run_.reset();
         }
@@ -462,6 +466,9 @@ class llm_task {
         if (lLaMa_) {
             lLaMa_->Deinit();
         }
+        if (lLaMa_ctx_) {
+            lLaMa_ctx_->Deinit();
+        }
     }
 };
 
@@ -514,7 +521,8 @@ class llm_llm : public StackFlow {
         if (!(llm_task_obj && llm_channel)) {
             return;
         }
-        llm_task_obj->lLaMa_->Stop();
+        if (llm_task_obj->lLaMa_) llm_task_obj->lLaMa_->Stop();
+        if (llm_task_obj->lLaMa_ctx_) llm_task_obj->lLaMa_ctx_->Stop();
     }
 
     void pause(const std::string &work_id, const std::string &object, const std::string &data) override
@@ -605,7 +613,8 @@ class llm_llm : public StackFlow {
         if (!(llm_task_obj && llm_channel)) {
             return;
         }
-        llm_task_obj->lLaMa_->Stop();
+        if (llm_task_obj->lLaMa_) llm_task_obj->lLaMa_->Stop();
+        if (llm_task_obj->lLaMa_ctx_) llm_task_obj->lLaMa_ctx_->Stop();
     }
 
     int setup(const std::string &work_id, const std::string &object, const std::string &data) override
diff --git a/projects/llm_framework/main_openai_api/SConstruct b/projects/llm_framework/main_openai_api/SConstruct
@@ -18,7 +18,7 @@ LINK_SEARCH_PATH = []
 STATIC_FILES = []
 
 
-ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/m5stack/ModuleLLM-OpenAI-Plugin.git', '5298be215735f5b1c21bc9225c38d7cb9c1933db', True)
+ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/m5stack/ModuleLLM-OpenAI-Plugin.git', 'a8f54b0430c478896b45828f612d0d8b0a6f2fa1', True)
 python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-openai-api-python-venv_v1.6.tar.gz", 'm5stack_llm-openai-api-python-venv_v1.6.tar.gz')
 
 
@@ -52,7 +52,7 @@ ignore['ignore'] = list(set(ignore['ignore']))
 with open('../dist/fileignore', 'w') as f:
     json.dump(ignore, f, indent=4)
 
-env['COMPONENTS'].append({'target':'llm_openai_api-1.8',
+env['COMPONENTS'].append({'target':'llm_openai_api-1.9',
                           'SRCS':SRCS,
                           'INCLUDE':INCLUDE,
                           'PRIVATE_INCLUDE':PRIVATE_INCLUDE,
diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py
@@ -387,7 +387,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep
         'llm-depth-anything':[create_bin_deb,'llm-depth-anything', '1.7', src_folder, revision],
         'llm-vad':[create_bin_deb,'llm-vad', '1.8', src_folder, revision],
         'llm-whisper':[create_bin_deb,'llm-whisper', '1.8', src_folder, revision],
-        'llm-openai-api':[create_bin_deb,'llm-openai-api', '1.8', src_folder, revision],
+        'llm-openai-api':[create_bin_deb,'llm-openai-api', '1.9', src_folder, revision],
         'llm-cosy-voice':[create_bin_deb,'llm-cosy-voice', '1.8', src_folder, revision],
         # keyword spotting Audio file
         'llm-model-audio-en-us':[create_data_deb,'llm-model-audio-en-us', data_version, src_folder, revision],

Original file line number	Diff line number	Diff line change
`@@ -464,7 +464,7 @@ class Token2Wav {`
`464`	`464`	`// fade_in_mel_data is now modified in-place with the faded result.`
`465`	`465`	`}`
`466`	`466`
`467`		`- void reset()`
	`467`	`+ void clear()`
`468`	`468`	`{`
`469`	`469`	`std::unordered_map<std::string, std::vector<float>>().swap(hift_cache_dict);`
`470`	`470`	`}`
Original file line number	Diff line number	Diff line change
`@@ -404,6 +404,7 @@ class llm_task {`
`404`	`404`	`bool pause()`
`405`	`405`	`{`
`406`	`406`	`if (lLaMa_) lLaMa_->Stop();`
	`407`	`+ if (lLaMa_ctx_) lLaMa_ctx_->Stop();`
`407`	`408`	`return true;`
`408`	`409`	`}`
`409`	`410`
`@@ -414,8 +415,10 @@ class llm_task {`
`414`	`415`	`waitpid(tokenizer_pid_, nullptr, 0);`
`415`	`416`	`tokenizer_pid_ = -1;`
`416`	`417`	`}`
`417`		`- lLaMa_->Deinit();`
`418`		`- lLaMa_.reset();`
	`418`	`+ if (lLaMa_) lLaMa_->Deinit();`
	`419`	`+ if (lLaMa_) lLaMa_.reset();`
	`420`	`+ if (lLaMa_ctx_) lLaMa_ctx_->Deinit();`
	`421`	`+ if (lLaMa_ctx_) lLaMa_ctx_.reset();`
`419`	`422`	`return true;`
`420`	`423`	`}`
`421`	`424`
`@@ -447,6 +450,7 @@ class llm_task {`
`447`	`450`	`std::string par;`
`448`	`451`	`async_list_.put(par);`
`449`	`452`	`if (lLaMa_) lLaMa_->Stop();`
	`453`	`+ if (lLaMa_ctx_) lLaMa_ctx_->Stop();`
`450`	`454`	`inference_run_->join();`
`451`	`455`	`inference_run_.reset();`
`452`	`456`	`}`
`@@ -462,6 +466,9 @@ class llm_task {`
`462`	`466`	`if (lLaMa_) {`
`463`	`467`	`lLaMa_->Deinit();`
`464`	`468`	`}`
	`469`	`+ if (lLaMa_ctx_) {`
	`470`	`+ lLaMa_ctx_->Deinit();`
	`471`	`+ }`
`465`	`472`	`}`
`466`	`473`	`};`
`467`	`474`
`@@ -514,7 +521,8 @@ class llm_llm : public StackFlow {`
`514`	`521`	`if (!(llm_task_obj && llm_channel)) {`
`515`	`522`	`return;`
`516`	`523`	`}`
`517`		`- llm_task_obj->lLaMa_->Stop();`
	`524`	`+ if (llm_task_obj->lLaMa_) llm_task_obj->lLaMa_->Stop();`
	`525`	`+ if (llm_task_obj->lLaMa_ctx_) llm_task_obj->lLaMa_ctx_->Stop();`
`518`	`526`	`}`
`519`	`527`
`520`	`528`	`void pause(const std::string &work_id, const std::string &object, const std::string &data) override`
`@@ -605,7 +613,8 @@ class llm_llm : public StackFlow {`
`605`	`613`	`if (!(llm_task_obj && llm_channel)) {`
`606`	`614`	`return;`
`607`	`615`	`}`
`608`		`- llm_task_obj->lLaMa_->Stop();`
	`616`	`+ if (llm_task_obj->lLaMa_) llm_task_obj->lLaMa_->Stop();`
	`617`	`+ if (llm_task_obj->lLaMa_ctx_) llm_task_obj->lLaMa_ctx_->Stop();`
`609`	`618`	`}`
`610`	`619`
`611`	`620`	`int setup(const std::string &work_id, const std::string &object, const std::string &data) override`