Merge branch 'feat/wn9l' into 'master'

Feat/wn9l

See merge request speech-recognition-framework/esp-sr!192
This commit is contained in:
Sun Xiang Yu
2025-10-20 20:04:01 +08:00
42 changed files with 43 additions and 14 deletions

View File

@@ -1,5 +1,9 @@
# Change log for esp-sr
## 2.2.0
- Support WakeNet9l models
- Update TTS pipeline v3.0
## 2.1.5
- Fix ringbuf bug
- Add more wakenet9 models

View File

@@ -88,10 +88,18 @@ menu "Load Multiple Wake Words (WakeNet9)"
bool "小爱同学 (wn9_xiaoaitongxue)"
default False
config SR_WN_WN9L_XIAOAITONGXUE
bool "小爱同学 (wn9l_xiaoaitongxue)"
default False
config SR_WN_WN9_NIHAOXIAOZHI_TTS
bool "你好小智 (wn9_nihaoxiaozhi_tts)"
default False
config SR_WN_WN9L_NIHAOXIAOZHI_TTS3
bool "你好小智 (wn9l_nihaoxiaozhi_tts3)"
default False
config SR_WN_WN9_ALEXA
bool "Alexa (wn9_alexa)"
default False
@@ -263,6 +271,10 @@ menu "Load Multiple Wake Words (WakeNet9)"
config SR_WN_WN9_NI3HAO3XIAO3MAI4_TTS2
bool "你好小脉 (wn9_ni3hao3xiao3mai4_tts2)"
default False
config SR_WN_WN9_NI3HAO3XIAO3RUI4_TTS3
bool "你好小瑞 (wn9_ni3hao3xiao3rui4_tts3)"
default False
endmenu

View File

@@ -21,6 +21,7 @@ These algorithms are provided in the form of a component, so they can be integra
News
----
[20/10/2025]: We add a new model, WakeNet9l. Built on WakeNet9, it further improves the response rate for wake words spoken at extremely fast speeds. WakeNet9l is used in the same way as WakeNet9, but its CPU and memory requirements are approximately 1.3 times higher than those of WakeNet9.
[21/4/2025]: We add a new model WakeNet9s, which can run on chips that do not have PSRAM and do not support SIMD, such as ESP32C3 and ESP32C5. [examples](https://github.com/espressif/esp-skainet/tree/master/examples/wake_word_detection)
[17/4/2025]: We add a new DOA(Direction of Arrival) algorithm.
[14/2/2025]: We release **ESP-SR V2.0**. [Migration from ESP-SR V1.* to ESP-SR V2.*](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/migration_guide.html)
@@ -88,6 +89,7 @@ The following wake words are supported in esp-sr:
|你好东东 | | wn9_nihaodongdong_tts2|
|你好小安 | | wn9_nihaoxiaoan_tts2|
|你好小脉 | | wn9_ni3hao3xiao3mai4_tts2|
|你好小瑞 | | wn9_ni3hao3xiao3rui4_tts3|
**NOTE**:

View File

@@ -32,9 +32,9 @@ Please see the flow diagram of WakeNet below:
- Neural Network
Now, the neural network structure has been updated to the ninth edition, among which:
- WakeNet1, WakeNet2, WakeNet3, WakeNet4, WakeNet6, and WakeNet7 had been out of use.
- WakeNet5 only supports ESP32 chip.
- WakeNet8 and WakeNet9 only support ESP32-S3 chip, which are built upon the `Dilated Convolution <https://arxiv.org/pdf/1609.03499.pdf>`__ structure.
- WakeNet1, WakeNet2, WakeNet3, WakeNet4, WakeNet5, WakeNet6, WakeNet7, and WakeNet8 are no longer in use.
- WakeNet9 and WakeNet9l support ESP32, ESP32S3, and ESP32P4 chips and are built upon the `Dilated Convolution <https://arxiv.org/pdf/1609.03499.pdf>`__ structure. Building on WakeNet9, WakeNet9l further improves the recognition rate of wake words spoken at very fast speeds.
- WakeNet9s supports ESP32C3, ESP32C5, and ESP32C6 chips and is built upon the `Depthwise Separable Convolution <https://arxiv.org/abs/1704.04861>`__ structure.
.. only:: latex

View File

@@ -207,6 +207,10 @@ WakeNet
| WakeNet9 @ 3 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
| Quantised | 16 KB | 324 KB | 4.0 ms | 32 ms |
| WakeNet9l @ 2 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
.. only:: esp32p4
@@ -223,6 +227,10 @@ WakeNet
| WakeNet9 @ 3 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
| Quantised | 16 KB | 324 KB | 3.6 ms | 32 ms |
| WakeNet9l @ 2 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
性能测试
~~~~~~~~

View File

@@ -32,9 +32,9 @@ WakeNet 的流程图如下:
- 神经网络 (Neural Network)
神经网络结构已经更新到第 9 版,其中:
- WakeNet1WakeNet2WakeNet3WakeNet4、WakeNet6 and WakeNet7 已经停止使用。
- WakeNet5 应用于 ESP32 芯片
- WakeNet8 和 WakeNet9 应用于 ESP32-S3 芯片,模型基于 `Dilated Convolution <https://arxiv.org/pdf/1609.03499.pdf>`__ 结构。
- WakeNet1、WakeNet2、WakeNet3、WakeNet4、WakeNet5、WakeNet6、WakeNet7 和 WakeNet8 已经停止使用。
- WakeNet9 和 WakeNet9l 应用于 ESP32、ESP32S3 和 ESP32P4 芯片,模型基于 `Dilated Convolution <https://arxiv.org/pdf/1609.03499.pdf>`__ 结构。WakeNet9l 在 WakeNet9 的基础上进一步提升了极快语速下唤醒词的识别率。
- WakeNet9s 应用于 ESP32C3、ESP32C5 和 ESP32C6 芯片,模型基于 `Depthwise Separable Convolution <https://arxiv.org/abs/1704.04861>`__ 结构。
.. only:: latex

View File

@@ -1,4 +1,4 @@
version: "2.1.5"
version: "2.2.0"
description: esp_sr provides basic algorithms for Speech Recognition applications
url: https://github.com/espressif/esp-sr
dependencies:

View File

@@ -46,7 +46,7 @@ typedef struct afe_fetch_result_t {
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
void *reserved; // reserved for future use
} afe_fetch_result_t;

View File

@@ -46,7 +46,7 @@ typedef struct afe_fetch_result_t {
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
void *reserved; // reserved for future use
} afe_fetch_result_t;

View File

@@ -46,7 +46,7 @@ typedef struct afe_fetch_result_t {
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
void *reserved; // reserved for future use
} afe_fetch_result_t;

View File

@@ -46,7 +46,7 @@ typedef struct afe_fetch_result_t {
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
void *reserved; // reserved for future use
} afe_fetch_result_t;

View File

@@ -46,7 +46,7 @@ typedef struct afe_fetch_result_t {
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
void *reserved; // reserved for future use
} afe_fetch_result_t;

View File

@@ -46,7 +46,7 @@ typedef struct afe_fetch_result_t {
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
void *reserved; // reserved for future use
} afe_fetch_result_t;

View File

@@ -46,7 +46,7 @@ typedef struct afe_fetch_result_t {
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
void *reserved; // reserved for future use
} afe_fetch_result_t;

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

@@ -0,0 +1 @@
wakenet9_tts2h12_Ni3Hao3Xiao3Rui4_3_0.610_0.619

View File

Binary file not shown.

View File

Binary file not shown.

View File

@@ -0,0 +1 @@
wakenet9l_tts2h24_Ni3Hao3Xiao3Zhi4_3_0.597_0.606

View File

Binary file not shown.

View File

Binary file not shown.

View File

@@ -0,0 +1 @@
wakenet9l_tts2h24_Xiaoaitongxue_3_0.581_0.601

View File

Binary file not shown.

View File

Binary file not shown.