@@ -65,7 +65,6 @@ class llm_task {
6565 std::atomic<bool > g_llm_finished{false };
6666 std::atomic<bool > g_stop{false };
6767 TokenBuffer g_token_buffer;
68- Token2Wav lToken2Wav;
6968
7069 std::vector<int > prompt_text_token;
7170 std::vector<unsigned short > prompt_text_embeds;
@@ -81,6 +80,7 @@ class llm_task {
8180 LLMAttrType mode_config_;
8281 Token2WavAttr infer_mode_config_;
8382 std::unique_ptr<LLM> lLaMa_;
83+ std::unique_ptr<Token2Wav> lToken2Wav_;
8484 std::string model_;
8585 std::string response_format_;
8686 std::vector<std::string> inputs_;
@@ -312,14 +312,15 @@ class llm_task {
312312 lLaMa_.reset ();
313313 return -2 ;
314314 }
315- if (!lToken2Wav.Init (infer_mode_config_)) {
315+ lToken2Wav_ = std::make_unique<Token2Wav>();
316+ if (!lToken2Wav_->Init (infer_mode_config_)) {
316317 lLaMa_->Deinit ();
317318 lLaMa_.reset ();
318319 return -1 ;
319320 }
320321 lLaMa_->TextToken2Embeds (prompt_text_token, prompt_text_embeds);
321322 lLaMa_->SpeechToken2Embeds (prompt_speech_token, prompt_speech_embeds);
322- lToken2Wav. SpeechToken2Embeds (prompt_speech_token, prompt_speech_embeds_flow);
323+ lToken2Wav_-> SpeechToken2Embeds (prompt_speech_token, prompt_speech_embeds_flow);
323324
324325 } catch (...) {
325326 SLOGE (" config false" );
@@ -358,7 +359,7 @@ class llm_task {
358359 {
359360 g_llm_finished = false ;
360361 g_token_buffer.erase (g_token_buffer.begin (), g_token_buffer.end ());
361- lToken2Wav. reset ();
362+ lToken2Wav_-> clear ();
362363 }
363364
364365 void resample_audio (float *input_buffer, int input_length, float *output_buffer, int *output_length,
@@ -428,7 +429,7 @@ class llm_task {
428429 }
429430 }
430431
431- int prompt_token_len = prompt_speech_embeds_flow.size () / lToken2Wav. _attr .flow_embed_size ;
432+ int prompt_token_len = prompt_speech_embeds_flow.size () / lToken2Wav_-> _attr .flow_embed_size ;
432433 if (prompt_token_len < 75 ) {
433434 SLOGE (" Error, prompt speech token len %d < 75" , prompt_token_len);
434435 if (llm_thread.joinable ()) llm_thread.join ();
@@ -450,28 +451,28 @@ class llm_task {
450451 int token_offset = 0 ;
451452 int i = 0 ;
452453 while (true ) {
453- this_token_hop_len = (token_offset == 0 ) ? lToken2Wav. _attr .token_hop_len + promot_token_pad
454- : lToken2Wav. _attr .token_hop_len ;
454+ this_token_hop_len = (token_offset == 0 ) ? lToken2Wav_-> _attr .token_hop_len + promot_token_pad
455+ : lToken2Wav_-> _attr .token_hop_len ;
455456 std::unique_lock<std::mutex> lock (g_buffer_mutex);
456457 g_buffer_cv.wait (lock, [&] {
457458 return (g_token_buffer.size () - token_offset >=
458- this_token_hop_len + lToken2Wav. _attr .pre_lookahead_len ) ||
459+ this_token_hop_len + lToken2Wav_-> _attr .pre_lookahead_len ) ||
459460 g_llm_finished.load () || g_stop.load ();
460461 });
461462 if (g_stop) {
462463 lock.unlock ();
463464 break ;
464465 } else if (g_token_buffer.size () - token_offset >=
465- this_token_hop_len + lToken2Wav. _attr .pre_lookahead_len ) {
466+ this_token_hop_len + lToken2Wav_-> _attr .pre_lookahead_len ) {
466467 std::vector<SpeechToken> token;
467- int start = token_offset - std::min (int (token_offset / lToken2Wav. _attr .token_hop_len ),
468- lToken2Wav. _attr .max_infer_chunk_num - 1 ) *
469- lToken2Wav. _attr .token_hop_len ;
470- int end = token_offset + this_token_hop_len + lToken2Wav. _attr .pre_lookahead_len ;
468+ int start = token_offset - std::min (int (token_offset / lToken2Wav_-> _attr .token_hop_len ),
469+ lToken2Wav_-> _attr .max_infer_chunk_num - 1 ) *
470+ lToken2Wav_-> _attr .token_hop_len ;
471+ int end = token_offset + this_token_hop_len + lToken2Wav_-> _attr .pre_lookahead_len ;
471472 token.insert (token.end (), g_token_buffer.begin () + start, g_token_buffer.begin () + end);
472473 lock.unlock ();
473- auto speech = lToken2Wav. infer (token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
474- token_offset, false );
474+ auto speech = lToken2Wav_-> infer (token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
475+ token_offset, false );
475476 token_offset += this_token_hop_len;
476477 output.insert (output.end (), speech.begin (), speech.end ());
477478 double src_ratio =
@@ -507,12 +508,12 @@ class llm_task {
507508 }
508509
509510 std::vector<SpeechToken> token;
510- int start = g_token_buffer.size () - std::min (int (g_token_buffer.size () / lToken2Wav. _attr .token_hop_len ),
511- lToken2Wav. _attr .max_infer_chunk_num - 1 ) *
512- lToken2Wav. _attr .token_hop_len ;
511+ int start = g_token_buffer.size () - std::min (int (g_token_buffer.size () / lToken2Wav_-> _attr .token_hop_len ),
512+ lToken2Wav_-> _attr .max_infer_chunk_num - 1 ) *
513+ lToken2Wav_-> _attr .token_hop_len ;
513514 token.insert (token.end (), g_token_buffer.begin () + start, g_token_buffer.end ());
514- auto speech = lToken2Wav. infer (token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
515- token_offset - start, true );
515+ auto speech = lToken2Wav_-> infer (token, prompt_speech_embeds_flow1, prompt_feat1, spk_embeds,
516+ token_offset - start, true );
516517 output.insert (output.end (), speech.begin (), speech.end ());
517518 double src_ratio =
518519 static_cast <double >(mode_config_.audio_rate ) / static_cast <double >(mode_config_.mode_rate );
@@ -662,6 +663,9 @@ class llm_task {
662663 if (lLaMa_) {
663664 lLaMa_->Deinit ();
664665 }
666+ if (lToken2Wav_) {
667+ lToken2Wav_->Deinit ();
668+ }
665669 }
666670};
667671
0 commit comments