Skip to content

Commit 01d6715

Browse files
author
LittleMouse
committed
[update] update cosy_voice & new kws
1 parent 52a09b6 commit 01d6715

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+4589
-83
lines changed

projects/llm_framework/main/SConstruct

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ STATIC_FILES += [AFile('../static_lib/sherpa/ncnn/libsherpa-ncnn-core.so'),
2828
AFile('../static_lib/wetext/libfst.so.16'),
2929
AFile('../static_lib/libonnxruntime.so.1'),
3030
AFile('../static_lib/libonnxruntime.so.1.14.0'),
31-
AFile('../static_lib/libzmq.so.5')
31+
AFile('../static_lib/libzmq.so.5'),
32+
AFile('../static_lib/libMNN.so')
3233
]
3334

3435
env['COMPONENTS'].append({'target':'static_file-1.0',

projects/llm_framework/main_cosy_voice/SConstruct

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ LDFLAGS = []
1717
LINK_SEARCH_PATH = []
1818
STATIC_FILES = []
1919

20-
python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-llm-python-venv_v1.7.tar.gz", 'm5stack_llm-llm-python-venv_v1.7.tar.gz')
20+
python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-cosy-voice-python-venv_v1.7.tar.gz", 'm5stack_llm-cosy-voice-python-venv_v1.7.tar.gz')
2121

2222
# REQUIREMENTS += ['Backward_cpp']
2323
# DYNAMIC_LIB += [ AFile('../static_lib/libdw.so.1'),
@@ -29,25 +29,24 @@ python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/reso
2929
DEFINITIONS += ['-O2']
3030
DEFINITIONS += ['-std=c++17']
3131
LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./']
32-
REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys', 'utilities']
32+
REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys', 'utilities', 'MNN']
3333
LINK_SEARCH_PATH += [ADir('../static_lib')]
3434

3535

3636
INCLUDE += [ADir('src/runner'), ADir('src/runner/utils')]
3737
INCLUDE += [ADir('../static_lib/include/sentencepiece'),
3838
ADir('../static_lib/include/protobuf-lite'),
3939
ADir('../static_lib/include/abseil-cpp'),
40+
ADir('../static_lib/include/mnn'),
4041
ADir('../static_lib/include/re2')]
4142

4243
static_file = Glob('../static_lib/module-llm/libabsl_*')
4344
static_file += [AFile('../static_lib/module-llm/libre2.a'), AFile('../static_lib/module-llm/libsentencepiece.a'), AFile('../static_lib/module-llm/libsentencepiece_train.a')]
4445
STATIC_LIB += static_file * 4
4546

46-
# STATIC_FILES += [os.path.join(python_venv, 'llm')]
47-
# STATIC_FILES += Glob('scripts/tokenizer_*.py')
47+
STATIC_FILES += [os.path.join(python_venv, 'cosy-voice')]
4848
STATIC_FILES += Glob('scripts/tokenizer*')
4949
STATIC_FILES += Glob('models/mode_*.json')
50-
# STATIC_FILES += [AFile('scripts/llm-llm_tokenizer_auto.py')]
5150

5251
IGNORE_FILES = []
5352
IGNORE_FILES += ['llm']

projects/llm_framework/main_cosy_voice/models/mode_CosyVoice2-0.5B-ax650.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@
3838
"b_use_mmap_load_embed": true,
3939
"b_dynamic_load_axmodel_layer": false,
4040
"ext_scripts": [
41-
"tokenizer_cosyvoice2-0.5B-ax650.py"
41+
"tokenizer_CosyVoice2-0.5B-ax650.py",
42+
"tokenizer"
4243
]
4344
}
4445
}

projects/llm_framework/main_cosy_voice/src/main.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ class llm_task {
203203

204204
tokenizer_pid_ = fork();
205205
if (tokenizer_pid_ == 0) {
206-
setenv("PYTHONPATH", "/opt/m5stack/lib/cosy_voice/site-packages", 1);
206+
setenv("PYTHONPATH", "/opt/m5stack/lib/cosy-voice/site-packages", 1);
207207
const std::string port_str = std::to_string(port_);
208208
const std::string model_id = base_model + "tokenizer";
209209

@@ -266,11 +266,8 @@ class llm_task {
266266
if (!lToken2Wav.Init(mode_config_.token2wav_axmodel_dir, mode_config_.n_timesteps)) {
267267
return -1;
268268
}
269-
SLOGE();
270269
lLaMa_->TextToken2Embeds(prompt_text_token, prompt_text_embeds);
271-
SLOGE();
272270
lLaMa_->SpeechToken2Embeds(prompt_speech_token, prompt_speech_embeds);
273-
SLOGE();
274271
lToken2Wav.SpeechToken2Embeds(prompt_speech_token, prompt_speech_embeds_flow);
275272

276273
} catch (...) {

projects/llm_framework/main_cosy_voice/src/runner/Token2wav.hpp

Lines changed: 79 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
#include "timer.hpp"
2121
// #include "opencv2/opencv.hpp"
2222
#include "ax_sys_api.h"
23+
#include "MNN/MNNDefine.h"
24+
#include "MNN/MNNForwardType.h"
25+
#include "MNN/Interpreter.hpp"
2326

2427
class Token2Wav
2528
{
@@ -44,8 +47,15 @@ class Token2Wav
4447
ax_runner_ax650 flow_estimator_250;
4548
ax_runner_ax650 flow_estimator_300;
4649

47-
ax_runner_ax650 hift_50_first;
48-
ax_runner_ax650 hift_58;
50+
ax_runner_ax650 hift_p2_50_first;
51+
ax_runner_ax650 hift_p2_58;
52+
53+
std::shared_ptr<MNN::Interpreter> hift_p1_50_first = nullptr;
54+
std::shared_ptr<MNN::Interpreter> hift_p1_58 = nullptr;
55+
56+
MNN::Session * sess_hift_p1_50_first = nullptr;
57+
MNN::Session * sess_hift_p1_58 = nullptr;
58+
4959

5060
std::vector<float> rand_noise;
5161
std::vector<float> t_span;
@@ -161,20 +171,44 @@ class Token2Wav
161171
return false;
162172
}
163173

164-
ret = hift_50_first.init((model_dir+"/hift_50_first.axmodel").c_str(), false);
174+
ret = hift_p2_50_first.init((model_dir+"/hift_p2_50_first.axmodel").c_str(), false);
165175
if (ret != 0)
166176
{
167-
ALOGE("init axmodel(%s) failed", (model_dir+"/hift_50_first.axmodel").c_str());
177+
ALOGE("init axmodel(%s) failed", (model_dir+"/hift_p2_50_first.axmodel").c_str());
168178
return false;
169179
}
170180

171-
ret = hift_58.init((model_dir+"/hift_58.axmodel").c_str(), false);
181+
ret = hift_p2_58.init((model_dir+"/hift_p2_58.axmodel").c_str(), false);
172182
if (ret != 0)
173183
{
174-
ALOGE("init axmodel(%s) failed", (model_dir+"/hift_58.axmodel").c_str());
184+
ALOGE("init axmodel(%s) failed", (model_dir+"/hift_p2_58.axmodel").c_str());
175185
return false;
176186
}
177187

188+
MNN::ScheduleConfig config;
189+
config.numThread = 2;
190+
config.type = static_cast<MNNForwardType>(MNN_FORWARD_CPU);
191+
MNN::BackendConfig backendConfig;
192+
backendConfig.precision = (MNN::BackendConfig::PrecisionMode)1;
193+
config.backendConfig = &backendConfig;
194+
195+
hift_p1_50_first = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile( (model_dir+"/hift_p1_50_first.mnn").c_str() ));
196+
if(nullptr == hift_p1_50_first)
197+
{
198+
ALOGE("init mnn model(%s) failed", (model_dir+"/hift_p1_50_first.mnn").c_str());
199+
return false;
200+
}
201+
sess_hift_p1_50_first = hift_p1_50_first->createSession(config);
202+
203+
hift_p1_58 = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile( (model_dir+"/hift_p1_58.mnn").c_str() ));
204+
if(nullptr == hift_p1_58)
205+
{
206+
ALOGE("init mnn model(%s) failed", (model_dir+"/hift_p1_58.mnn").c_str() );
207+
return false;
208+
}
209+
210+
sess_hift_p1_58 = hift_p1_58->createSession(config);
211+
178212
ALOGI("Token2Wav init ok");
179213
return true;
180214
}
@@ -188,8 +222,8 @@ class Token2Wav
188222
flow_estimator_200.release();
189223
flow_estimator_250.release();
190224
flow_estimator_300.release();
191-
hift_50_first.release();
192-
hift_58.release();
225+
hift_p2_50_first.release();
226+
hift_p2_58.release();
193227
flow_embed_selector.Deinit();
194228
}
195229

@@ -318,39 +352,66 @@ class Token2Wav
318352
int infer_hift(std::vector<float> &mel, std::vector<float> &cache_source,
319353
std::vector<float> & tts_speech, std::vector<float> & tts_source)
320354
{
321-
ax_runner_ax650 * model;
355+
std::shared_ptr<MNN::Interpreter> model_p1;
356+
MNN::Session * sess_p1;
357+
ax_runner_ax650 * model_p2;
322358
int len = mel.size()/(80);
323359

324360
if(len == 50 && cache_source.empty())
325361
{
326-
model = &hift_50_first;
362+
model_p1 = hift_p1_50_first;
363+
sess_p1 = sess_hift_p1_50_first;
364+
model_p2 = &hift_p2_50_first;
327365
}else if(len == 58 && !cache_source.empty())
328366
{
329-
model = &hift_58;
367+
model_p1 = hift_p1_58;
368+
sess_p1 = sess_hift_p1_58;
369+
model_p2 = &hift_p2_58;
330370
}else
331371
{
332372
ALOGE("invalid size: %d", len);
333373
return -1;
334374
}
335375

336-
void * p = model->get_input("mel").pVirAddr;
376+
std::vector<int> dims{1, 80, len};
377+
auto tensor = MNN::Tensor::create<float>(dims, NULL, MNN::Tensor::CAFFE);
378+
auto p_tensor = tensor->host<float>();
379+
auto size = tensor->size();
380+
std::memcpy(p_tensor, mel.data(), size);
381+
382+
auto inputTensor = model_p1->getSessionInput(sess_p1, nullptr);
383+
inputTensor->copyFromHostTensor(tensor);
384+
385+
model_p1->runSession(sess_p1);
386+
387+
MNN::Tensor *p_out = model_p1->getSessionOutput(sess_p1, "s");
388+
MNN::Tensor out_host(p_out, p_out->getDimensionType());
389+
p_out->copyToHostTensor(&out_host);
390+
391+
auto p_s = out_host.host<float>();
392+
393+
void * p = model_p2->get_input("s").pVirAddr;
394+
memcpy(p, p_s, len * 480 * sizeof(float));
395+
396+
p = model_p2->get_input("mel").pVirAddr;
337397
memcpy(p, mel.data(), mel.size() * sizeof(float));
398+
338399
if(!cache_source.empty())
339400
{
340-
p = model->get_input("hift_cache_source").pVirAddr;
401+
p = model_p2->get_input("hift_cache_source").pVirAddr;
341402
memcpy(p, cache_source.data(), cache_source.size() * sizeof(float));
342403
}
343-
344-
model->inference();
345-
346-
auto &output_speech = model->get_output("audio");
404+
405+
model_p2->inference();
406+
407+
auto &output_speech = model_p2->get_output("audio");
347408
if(tts_speech.empty() || tts_speech.size() != output_speech.nSize / sizeof(float))
348409
{
349410
tts_speech.resize(output_speech.nSize / sizeof(float));
350411
}
351412
memcpy(tts_speech.data(), output_speech.pVirAddr, output_speech.nSize);
352413

353-
auto &output_source = model->get_output("x");
414+
auto &output_source = model_p2->get_output(1);
354415
if(tts_source.empty() || tts_source.size() != output_source.nSize / sizeof(float))
355416
{
356417
tts_source.resize(output_source.nSize / sizeof(float));

projects/llm_framework/main_kws_new/SConstruct

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,19 @@ DEFINITIONS += ['-std=c++17']
2222
LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./']
2323
LINK_SEARCH_PATH += [ADir('../static_lib')]
2424

25-
INCLUDE += [ADir('../static_lib/include/sherpa'),
26-
ADir('../static_lib/include/sherpa/sherpa-onnx'),
25+
INCLUDE += [
26+
# ADir('../static_lib/include/sherpa'),
27+
# ADir('../static_lib/include/sherpa/sherpa-onnx'),
2728
ADir('../static_lib/include/sherpa/sherpa-onnx/onnxruntime-src'),
28-
ADir('../static_lib/include/sherpa/sherpa-onnx/openfst-src')
29+
ADir('src/runner'),
30+
# ADir('../static_lib/include/sherpa/sherpa-onnx/openfst-src')
2931
]
3032

31-
LINK_SEARCH_PATH += [ADir('../static_lib/sherpa/onnx')]
32-
LDFLAGS += ['-l:libcargs.a',
33-
'-l:libsherpa-onnx-core.a', '-l:libkaldi-native-fbank-core.a',
34-
'-l:libkaldi-decoder-core.a', '-l:libssentencepiece_core.a']
33+
LINK_SEARCH_PATH += [ADir('../static_lib/sherpa/fbank')]
34+
LDFLAGS += [
35+
'-l:libkaldi-native-fbank-core.a',
36+
'-l:libkissfft-float.a',
37+
]
3538

3639
REQUIREMENTS += ['onnxruntime']
3740

0 commit comments

Comments
 (0)