diff --git a/CMakeLists.txt b/CMakeLists.txt index 3632661..78e32c7 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # CMakeLists in this exact order for cmake to work correctly cmake_minimum_required(VERSION 3.16) -set(PROJECT_VER "0.3.0") +set(PROJECT_VER "0.3.1") include($ENV{IDF_PATH}/tools/cmake/project.cmake) project(xiaozhi) diff --git a/main/Application.cc b/main/Application.cc index d45b77b..d667a35 100644 --- a/main/Application.cc +++ b/main/Application.cc @@ -1,27 +1,28 @@ -#include "Application.h" -#include "BuiltinLed.h" -#include "TlsTransport.h" -#include "Ml307SslTransport.h" -#include "WifiConfigurationAp.h" -#include "WifiStation.h" +#include +#include +#include +#include +#include +#include #include -#include "esp_log.h" -#include "model_path.h" -#include "SystemInfo.h" -#include "cJSON.h" -#include "driver/gpio.h" +#include +#include +#include + +#include "Application.h" #define TAG "Application" Application::Application() + : button_((gpio_num_t)CONFIG_BOOT_BUTTON_GPIO) #ifdef CONFIG_USE_ML307 - : ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096), + , ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096), http_(ml307_at_modem_), firmware_upgrade_(http_) #else - : http_(), + , http_(), firmware_upgrade_(http_) #endif #ifdef CONFIG_USE_DISPLAY @@ -29,16 +30,6 @@ Application::Application() #endif { event_group_ = xEventGroupCreate(); - audio_encode_queue_ = xQueueCreate(100, sizeof(iovec)); - audio_decode_queue_ = xQueueCreate(100, sizeof(AudioPacket*)); - - srmodel_list_t *models = esp_srmodel_init("model"); - for (int i = 0; i < models->num; i++) { - ESP_LOGI(TAG, "Model %d: %s", i, models->model_name[i]); - if (strstr(models->model_name[i], ESP_WN_PREFIX) != NULL) { - wakenet_model_ = models->model_name[i]; - } - } opus_encoder_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1); opus_decoder_ = opus_decoder_create(opus_decode_sample_rate_, 1, NULL); @@ -52,32 +43,12 @@ Application::Application() } Application::~Application() { - if (afe_detection_data_ != nullptr) { - esp_afe_sr_v1.destroy(afe_detection_data_); - } - - if (afe_communication_data_ != nullptr) { - esp_afe_vc_v1.destroy(afe_communication_data_); - } - - if (wake_word_encode_task_stack_ != nullptr) { - free(wake_word_encode_task_stack_); - } - for (auto& pcm : wake_word_pcm_) { - free(pcm.iov_base); - } - if (opus_decoder_ != nullptr) { opus_decoder_destroy(opus_decoder_); } if (audio_encode_task_stack_ != nullptr) { free(audio_encode_task_stack_); } - if (audio_decode_task_stack_ != nullptr) { - free(audio_decode_task_stack_); - } - vQueueDelete(audio_decode_queue_); - vQueueDelete(audio_encode_queue_); vEventGroupDelete(event_group_); } @@ -204,43 +175,139 @@ void Application::Start() { } #endif + audio_device_.OnInputData([this](const int16_t* data, int size) { +#ifdef CONFIG_USE_AFE_SR + if (audio_processor_.IsRunning()) { + audio_processor_.Input(data, size); + } + if (wake_word_detect_.IsDetectionRunning()) { + wake_word_detect_.Feed(data, size); + } +#else + std::vector pcm(data, data + size); + Schedule([this, pcm = std::move(pcm)]() { + if (chat_state_ == kChatStateListening) { + std::lock_guard lock(mutex_); + audio_encode_queue_.emplace_back(std::move(pcm)); + cv_.notify_all(); + } + }); +#endif + }); + // Initialize the audio device audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE); - audio_device_.OnStateChanged([this]() { - if (audio_device_.playing()) { - SetChatState(kChatStateSpeaking); - } else { - // Check if communication is still running - if (xEventGroupGetBits(event_group_) & COMMUNICATION_RUNNING) { - SetChatState(kChatStateListening); - } else { - SetChatState(kChatStateIdle); - } - } - }); // OPUS encoder / decoder use a lot of stack memory const size_t opus_stack_size = 4096 * 8; audio_encode_task_stack_ = (StackType_t*)malloc(opus_stack_size); - xTaskCreateStatic([](void* arg) { + audio_encode_task_ = xTaskCreateStatic([](void* arg) { Application* app = (Application*)arg; app->AudioEncodeTask(); vTaskDelete(NULL); }, "opus_encode", opus_stack_size, this, 1, audio_encode_task_stack_, &audio_encode_task_buffer_); - audio_decode_task_stack_ = (StackType_t*)malloc(opus_stack_size); - xTaskCreateStatic([](void* arg) { - Application* app = (Application*)arg; - app->AudioDecodeTask(); - vTaskDelete(NULL); - }, "opus_decode", opus_stack_size, this, 1, audio_decode_task_stack_, &audio_decode_task_buffer_); - StartCommunication(); - StartDetection(); + xTaskCreate([](void* arg) { + Application* app = (Application*)arg; + app->AudioPlayTask(); + vTaskDelete(NULL); + }, "play_audio", 4096 * 2, this, 5, NULL); + +#ifdef CONFIG_USE_AFE_SR + wake_word_detect_.OnVadStateChange([this](bool speaking) { + Schedule([this, speaking]() { + auto& builtin_led = BuiltinLed::GetInstance(); + if (chat_state_ == kChatStateListening) { + if (speaking) { + builtin_led.SetRed(32); + } else { + builtin_led.SetRed(8); + } + builtin_led.TurnOn(); + } + }); + }); + + wake_word_detect_.OnWakeWordDetected([this]() { + Schedule([this]() { + if (chat_state_ == kChatStateIdle) { + // Encode the wake word data and start websocket client at the same time + // They both consume a lot of time (700ms), so we can do them in parallel + wake_word_detect_.EncodeWakeWordData(); + + SetChatState(kChatStateConnecting); + if (ws_client_ == nullptr) { + StartWebSocketClient(); + } + if (ws_client_ && ws_client_->IsConnected()) { + auto encoded = wake_word_detect_.GetWakeWordStream(); + // Send the wake word data to the server + ws_client_->Send(encoded.data(), encoded.size(), true); + opus_encoder_.ResetState(); + // Send a ready message to indicate the server that the wake word data is sent + SetChatState(kChatStateWakeWordDetected); + // If connected, the hello message is already sent, so we can start communication + audio_processor_.Start(); + ESP_LOGI(TAG, "Audio processor started"); + } else { + SetChatState(kChatStateIdle); + } + } else if (chat_state_ == kChatStateSpeaking) { + break_speaking_ = true; + } + + // Resume detection + wake_word_detect_.StartDetection(); + }); + }); + wake_word_detect_.StartDetection(); + + audio_processor_.OnOutput([this](std::vector&& data) { + Schedule([this, data = std::move(data)]() { + if (chat_state_ == kChatStateListening) { + std::lock_guard lock(mutex_); + audio_encode_queue_.emplace_back(std::move(data)); + cv_.notify_all(); + } + }); + }); +#endif // Blink the LED to indicate the device is running builtin_led.SetGreen(); builtin_led.BlinkOnce(); - xEventGroupSetBits(event_group_, DETECTION_RUNNING); + + button_.OnClick([this]() { + Schedule([this]() { + if (chat_state_ == kChatStateIdle) { + SetChatState(kChatStateConnecting); + StartWebSocketClient(); + + if (ws_client_ && ws_client_->IsConnected()) { + opus_encoder_.ResetState(); +#ifdef CONFIG_USE_AFE_SR + audio_processor_.Start(); +#endif + SetChatState(kChatStateListening); + ESP_LOGI(TAG, "Communication started"); + } else { + SetChatState(kChatStateIdle); + } + } else if (chat_state_ == kChatStateSpeaking) { + break_speaking_ = true; + } else if (chat_state_ == kChatStateListening) { + if (ws_client_ && ws_client_->IsConnected()) { + ws_client_->Close(); + } + } + }); + }); + + xTaskCreate([](void* arg) { + Application* app = (Application*)arg; + app->MainLoop(); + vTaskDelete(NULL); + }, "main_loop", 4096 * 2, this, 5, NULL); // Launch a task to check for new firmware version xTaskCreate([](void* arg) { @@ -259,6 +326,28 @@ void Application::Start() { #endif } +void Application::Schedule(std::function callback) { + std::lock_guard lock(mutex_); + main_tasks_.push_back(callback); + cv_.notify_all(); +} + +// The Main Loop controls the chat state and websocket connection +// If other tasks need to access the websocket or chat state, +// they should use Schedule to call this function +void Application::MainLoop() { + while (true) { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this]() { + return !main_tasks_.empty(); + }); + auto task = std::move(main_tasks_.front()); + main_tasks_.pop_front(); + lock.unlock(); + task(); + } +} + void Application::SetChatState(ChatState state) { const char* state_str[] = { "idle", @@ -294,199 +383,27 @@ void Application::SetChatState(ChatState state) { builtin_led.SetBlue(); builtin_led.TurnOn(); break; - case kChatStateTesting: - builtin_led.SetRed(); - builtin_led.TurnOn(); - break; case kChatStateUpgrading: builtin_led.SetGreen(); builtin_led.StartContinuousBlink(100); break; } - std::lock_guard lock(mutex_); if (ws_client_ && ws_client_->IsConnected()) { cJSON* root = cJSON_CreateObject(); cJSON_AddStringToObject(root, "type", "state"); cJSON_AddStringToObject(root, "state", state_str[chat_state_]); char* json = cJSON_PrintUnformatted(root); + + std::lock_guard lock(mutex_); ws_client_->Send(json); cJSON_Delete(root); free(json); } } -void Application::StartCommunication() { - afe_config_t afe_config = { - .aec_init = false, - .se_init = true, - .vad_init = true, - .wakenet_init = false, - .voice_communication_init = true, - .voice_communication_agc_init = true, - .voice_communication_agc_gain = 10, - .vad_mode = VAD_MODE_3, - .wakenet_model_name = NULL, - .wakenet_model_name_2 = NULL, - .wakenet_mode = DET_MODE_90, - .afe_mode = SR_MODE_HIGH_PERF, - .afe_perferred_core = 0, - .afe_perferred_priority = 5, - .afe_ringbuf_size = 50, - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, - .afe_linear_gain = 1.0, - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, - .pcm_config = { - .total_ch_num = 1, - .mic_num = 1, - .ref_num = 0, - .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE, - }, - .debug_init = false, - .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }}, - .afe_ns_mode = NS_MODE_SSP, - .afe_ns_model_name = NULL, - .fixed_first_channel = true, - }; - - afe_communication_data_ = esp_afe_vc_v1.create_from_config(&afe_config); - - xTaskCreate([](void* arg) { - Application* app = (Application*)arg; - app->AudioCommunicationTask(); - vTaskDelete(NULL); - }, "audio_communication", 4096 * 2, this, 5, NULL); -} - -void Application::StartDetection() { - afe_config_t afe_config = { - .aec_init = false, - .se_init = true, - .vad_init = true, - .wakenet_init = true, - .voice_communication_init = false, - .voice_communication_agc_init = false, - .voice_communication_agc_gain = 10, - .vad_mode = VAD_MODE_3, - .wakenet_model_name = wakenet_model_, - .wakenet_model_name_2 = NULL, - .wakenet_mode = DET_MODE_90, - .afe_mode = SR_MODE_HIGH_PERF, - .afe_perferred_core = 0, - .afe_perferred_priority = 5, - .afe_ringbuf_size = 50, - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, - .afe_linear_gain = 1.0, - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, - .pcm_config = { - .total_ch_num = 1, - .mic_num = 1, - .ref_num = 0, - .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE - }, - .debug_init = false, - .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }}, - .afe_ns_mode = NS_MODE_SSP, - .afe_ns_model_name = NULL, - .fixed_first_channel = true, - }; - - afe_detection_data_ = esp_afe_sr_v1.create_from_config(&afe_config); - xTaskCreate([](void* arg) { - Application* app = (Application*)arg; - app->AudioFeedTask(); - vTaskDelete(NULL); - }, "audio_feed", 4096 * 2, this, 5, NULL); - - xTaskCreate([](void* arg) { - Application* app = (Application*)arg; - app->AudioDetectionTask(); - vTaskDelete(NULL); - }, "audio_detection", 4096 * 2, this, 5, NULL); -} - -void Application::AudioFeedTask() { - int chunk_size = esp_afe_vc_v1.get_feed_chunksize(afe_detection_data_); - int16_t buffer[chunk_size]; - ESP_LOGI(TAG, "Audio feed task started, chunk size: %d", chunk_size); - - while (true) { - audio_device_.Read(buffer, chunk_size); - - auto event_bits = xEventGroupGetBits(event_group_); - if (event_bits & DETECTION_RUNNING) { - esp_afe_sr_v1.feed(afe_detection_data_, buffer); - } else if (event_bits & COMMUNICATION_RUNNING) { - esp_afe_vc_v1.feed(afe_communication_data_, buffer); - } - } - - vTaskDelete(NULL); -} - -void Application::StoreWakeWordData(uint8_t* data, size_t size) { - // store audio data to wake_word_pcm_ - auto iov = (iovec){ - .iov_base = heap_caps_malloc(size, MALLOC_CAP_SPIRAM), - .iov_len = size - }; - memcpy(iov.iov_base, data, size); - wake_word_pcm_.push_back(iov); - // keep about 2 seconds of data, detect duration is 32ms (sample_rate == 16000, chunksize == 512) - while (wake_word_pcm_.size() > 2000 / 32) { - heap_caps_free(wake_word_pcm_.front().iov_base); - wake_word_pcm_.pop_front(); - } -} - -void Application::EncodeWakeWordData() { - if (wake_word_encode_task_stack_ == nullptr) { - wake_word_encode_task_stack_ = (StackType_t*)malloc(4096 * 8); - } - wake_word_encode_task_ = xTaskCreateStatic([](void* arg) { - Application* app = (Application*)arg; - auto start_time = esp_timer_get_time(); - // encode detect packets - OpusEncoder* encoder = new OpusEncoder(); - encoder->Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1, 60); - encoder->SetComplexity(0); - app->wake_word_opus_.resize(4096 * 4); - size_t offset = 0; - - for (auto& pcm: app->wake_word_pcm_) { - encoder->Encode(pcm, [app, &offset](const iovec opus) { - size_t protocol_size = sizeof(BinaryProtocol) + opus.iov_len; - if (offset + protocol_size < app->wake_word_opus_.size()) { - auto protocol = (BinaryProtocol*)(&app->wake_word_opus_[offset]); - protocol->version = htons(PROTOCOL_VERSION); - protocol->type = htons(0); - protocol->reserved = 0; - protocol->timestamp = htonl(app->audio_device_.playing() ? app->audio_device_.last_timestamp() : 0); - protocol->payload_size = htonl(opus.iov_len); - memcpy(protocol->payload, opus.iov_base, opus.iov_len); - offset += protocol_size; - } - }); - heap_caps_free(pcm.iov_base); - } - app->wake_word_pcm_.clear(); - app->wake_word_opus_.resize(offset); - - auto end_time = esp_timer_get_time(); - ESP_LOGI(TAG, "Encode wake word opus: %zu bytes in %lld ms", app->wake_word_opus_.size(), (end_time - start_time) / 1000); - xEventGroupSetBits(app->event_group_, WAKE_WORD_ENCODED); - delete encoder; - vTaskDelete(NULL); - }, "encode_detect_packets", 4096 * 8, this, 1, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_); -} - -void Application::SendWakeWordData() { - ws_client_->Send(wake_word_opus_.data(), wake_word_opus_.size(), true); - wake_word_opus_.clear(); -} - -BinaryProtocol* Application::AllocateBinaryProtocol(void* payload, size_t payload_size) { - auto last_timestamp = audio_device_.playing() ? audio_device_.last_timestamp() : 0; +BinaryProtocol* Application::AllocateBinaryProtocol(const uint8_t* payload, size_t payload_size) { + auto last_timestamp = 0; auto protocol = (BinaryProtocol*)heap_caps_malloc(sizeof(BinaryProtocol) + payload_size, MALLOC_CAP_SPIRAM); protocol->version = htons(PROTOCOL_VERSION); protocol->type = htons(0); @@ -498,197 +415,34 @@ BinaryProtocol* Application::AllocateBinaryProtocol(void* payload, size_t payloa return protocol; } -void Application::CheckTestButton() { - if (gpio_get_level(GPIO_NUM_1) == 0) { - if (chat_state_ == kChatStateIdle) { - SetChatState(kChatStateTesting); - test_resampler_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE); - } - } else { - if (chat_state_ == kChatStateTesting) { - SetChatState(kChatStateIdle); - - // 创建新线程来处理音频播放 - xTaskCreate([](void* arg) { - Application* app = static_cast(arg); - app->PlayTestAudio(); - vTaskDelete(NULL); - }, "play_test_audio", 4096, this, 1, NULL); - } - } -} - -void Application::PlayTestAudio() { - // 写入音频数据到扬声器 - auto packet = new AudioPacket(); - packet->type = kAudioPacketTypeStart; - audio_device_.QueueAudioPacket(packet); - - for (auto& pcm : test_pcm_) { - packet = new AudioPacket(); - packet->type = kAudioPacketTypeData; - packet->pcm.resize(test_resampler_.GetOutputSamples(pcm.iov_len / 2)); - test_resampler_.Process((int16_t*)pcm.iov_base, pcm.iov_len / 2, packet->pcm.data()); - audio_device_.QueueAudioPacket(packet); - heap_caps_free(pcm.iov_base); - } - // 清除测试PCM数据 - test_pcm_.clear(); - - // 停止音频设备 - packet = new AudioPacket(); - packet->type = kAudioPacketTypeStop; - audio_device_.QueueAudioPacket(packet); -} - -void Application::AudioDetectionTask() { - auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_); - ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size); - - while (true) { - xEventGroupWaitBits(event_group_, DETECTION_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY); - - auto res = esp_afe_sr_v1.fetch(afe_detection_data_); - if (res == nullptr || res->ret_value == ESP_FAIL) { - ESP_LOGE(TAG, "Error in AudioDetectionTask"); - if (res != nullptr) { - ESP_LOGI(TAG, "Error code: %d", res->ret_value); - } - continue;; - } - - // Store the wake word data for voice recognition, like who is speaking - StoreWakeWordData((uint8_t*)res->data, res->data_size); - - CheckTestButton(); - if (chat_state_ == kChatStateTesting) { - auto& builtin_led = BuiltinLed::GetInstance(); - if (res->vad_state == AFE_VAD_SPEECH) { - iovec iov = { - .iov_base = heap_caps_malloc(res->data_size, MALLOC_CAP_SPIRAM), - .iov_len = (size_t)res->data_size - }; - memcpy(iov.iov_base, res->data, res->data_size); - test_pcm_.push_back(iov); - builtin_led.SetRed(128); - } else { - builtin_led.SetRed(32); - } - builtin_led.TurnOn(); - continue; - } - - if (chat_state_ == kChatStateIdle && res->wakeup_state == WAKENET_DETECTED) { - xEventGroupClearBits(event_group_, DETECTION_RUNNING); - SetChatState(kChatStateConnecting); - - // Encode the wake word data and start websocket client at the same time - // They both consume a lot of time (700ms), so we can do them in parallel - EncodeWakeWordData(); - StartWebSocketClient(); - - // Here the websocket is done, and we also wait for the wake word data to be encoded - xEventGroupWaitBits(event_group_, WAKE_WORD_ENCODED, pdTRUE, pdTRUE, portMAX_DELAY); - - std::lock_guard lock(mutex_); - if (ws_client_ && ws_client_->IsConnected()) { - // Send the wake word data to the server - SendWakeWordData(); - // Send a ready message to indicate the server that the wake word data is sent - SetChatState(kChatStateWakeWordDetected); - opus_encoder_.ResetState(); - // If connected, the hello message is already sent, so we can start communication - xEventGroupSetBits(event_group_, COMMUNICATION_RUNNING); - ESP_LOGI(TAG, "Communication running"); - } else { - SetChatState(kChatStateIdle); - xEventGroupSetBits(event_group_, DETECTION_RUNNING); - } - } - } -} - -void Application::AudioCommunicationTask() { - int chunk_size = esp_afe_vc_v1.get_fetch_chunksize(afe_communication_data_); - ESP_LOGI(TAG, "Audio communication task started, chunk size: %d", chunk_size); - - while (true) { - xEventGroupWaitBits(event_group_, COMMUNICATION_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY); - - auto res = esp_afe_vc_v1.fetch(afe_communication_data_); - if (res == nullptr || res->ret_value == ESP_FAIL) { - ESP_LOGE(TAG, "Error in AudioCommunicationTask"); - if (res != nullptr) { - ESP_LOGI(TAG, "Error code: %d", res->ret_value); - } - continue; - } - - // Check if the websocket client is disconnected by the server - { - std::lock_guard lock(mutex_); - if (ws_client_ == nullptr || !ws_client_->IsConnected()) { - xEventGroupClearBits(event_group_, COMMUNICATION_RUNNING); - if (audio_device_.playing()) { - audio_device_.Break(); - } - SetChatState(kChatStateIdle); - if (ws_client_ != nullptr) { - delete ws_client_; - ws_client_ = nullptr; - } - xEventGroupSetBits(event_group_, DETECTION_RUNNING); - continue; - } - } - - if (chat_state_ == kChatStateListening) { - // Update the LED state based on the VAD state - auto& builtin_led = BuiltinLed::GetInstance(); - if (res->vad_state == AFE_VAD_SPEECH) { - builtin_led.SetRed(128); - } else { - builtin_led.SetRed(32); - } - builtin_led.TurnOn(); - - // Send audio data to server - iovec data = { - .iov_base = malloc(res->data_size), - .iov_len = (size_t)res->data_size - }; - memcpy(data.iov_base, res->data, res->data_size); - xQueueSend(audio_encode_queue_, &data, portMAX_DELAY); - } - } -} - void Application::AudioEncodeTask() { ESP_LOGI(TAG, "Audio encode task started"); while (true) { - iovec pcm; - xQueueReceive(audio_encode_queue_, &pcm, portMAX_DELAY); - - // Encode audio data - opus_encoder_.Encode(pcm, [this](const iovec opus) { - auto protocol = AllocateBinaryProtocol(opus.iov_base, opus.iov_len); - std::lock_guard lock(mutex_); - if (ws_client_ && ws_client_->IsConnected()) { - ws_client_->Send(protocol, sizeof(BinaryProtocol) + opus.iov_len, true); - } - heap_caps_free(protocol); + std::unique_lock lock(mutex_); + cv_.wait(lock, [this]() { + return !audio_encode_queue_.empty() || !audio_decode_queue_.empty(); }); - free(pcm.iov_base); - } -} + if (!audio_encode_queue_.empty()) { + auto pcm = std::move(audio_encode_queue_.front()); + audio_encode_queue_.pop_front(); + lock.unlock(); -void Application::AudioDecodeTask() { - while (true) { - AudioPacket* packet; - xQueueReceive(audio_decode_queue_, &packet, portMAX_DELAY); + // Encode audio data + opus_encoder_.Encode(pcm, [this](const uint8_t* opus, size_t opus_size) { + auto protocol = AllocateBinaryProtocol(opus, opus_size); + Schedule([this, protocol, opus_size]() { + if (ws_client_ && ws_client_->IsConnected()) { + ws_client_->Send(protocol, sizeof(BinaryProtocol) + opus_size, true); + } + heap_caps_free(protocol); + }); + }); + } else if (!audio_decode_queue_.empty()) { + auto packet = std::move(audio_decode_queue_.front()); + audio_decode_queue_.pop_front(); + lock.unlock(); - if (packet->type == kAudioPacketTypeData) { int frame_size = opus_decode_sample_rate_ / 1000 * opus_duration_ms_; packet->pcm.resize(frame_size); @@ -705,9 +459,74 @@ void Application::AudioDecodeTask() { opus_resampler_.Process(packet->pcm.data(), frame_size, resampled.data()); packet->pcm = std::move(resampled); } + + std::lock_guard lock(mutex_); + audio_play_queue_.push_back(packet); + cv_.notify_all(); + } + } +} + +void Application::HandleAudioPacket(AudioPacket* packet) { + switch (packet->type) + { + case kAudioPacketTypeData: { + if (skip_to_end_) { + break; } - audio_device_.QueueAudioPacket(packet); + // This will block until the audio device has finished playing the audio + audio_device_.OutputData(packet->pcm); + + if (break_speaking_) { + break_speaking_ = false; + skip_to_end_ = true; + + // Play a silence and skip to the end + int frame_size = opus_decode_sample_rate_ / 1000 * opus_duration_ms_; + std::vector silence(frame_size); + bzero(silence.data(), silence.size() * sizeof(int16_t)); + audio_device_.OutputData(silence); + } + break; + } + case kAudioPacketTypeStart: + Schedule([this]() { + SetChatState(kChatStateSpeaking); + }); + break; + case kAudioPacketTypeStop: + skip_to_end_ = false; + Schedule([this]() { + SetChatState(kChatStateListening); + }); + break; + case kAudioPacketTypeSentenceStart: + ESP_LOGI(TAG, "<< %s", packet->text.c_str()); + break; + case kAudioPacketTypeSentenceEnd: + break; + default: + ESP_LOGI(TAG, "Unknown packet type: %d", packet->type); + break; + } + + delete packet; +} + +void Application::AudioPlayTask() { + ESP_LOGI(TAG, "Audio play task started"); + + while (true) { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this]() { + return !audio_play_queue_.empty(); + }); + auto packet = std::move(audio_play_queue_.front()); + audio_play_queue_.pop_front(); + lock.unlock(); + + HandleAudioPacket(packet); } } @@ -726,6 +545,7 @@ void Application::SetDecodeSampleRate(int sample_rate) { void Application::StartWebSocketClient() { if (ws_client_ != nullptr) { + ESP_LOGW(TAG, "WebSocket client already exists"); delete ws_client_; } @@ -746,7 +566,6 @@ void Application::StartWebSocketClient() { // keys: message type, version, wakeup_model, audio_params (format, sample_rate, channels) std::string message = "{"; message += "\"type\":\"hello\","; - message += "\"wakeup_model\":\"" + std::string(wakenet_model_) + "\","; message += "\"audio_params\":{"; message += "\"format\":\"opus\", \"sample_rate\":" + std::to_string(CONFIG_AUDIO_INPUT_SAMPLE_RATE) + ", \"channels\":1"; message += "}}"; @@ -763,7 +582,10 @@ void Application::StartWebSocketClient() { auto payload_size = ntohl(protocol->payload_size); packet->opus.resize(payload_size); memcpy(packet->opus.data(), protocol->payload, payload_size); - xQueueSend(audio_decode_queue_, &packet, portMAX_DELAY); + + std::lock_guard lock(mutex_); + audio_decode_queue_.push_back(packet); + cv_.notify_all(); } else { // Parse JSON data auto root = cJSON_Parse(data); @@ -786,7 +608,10 @@ void Application::StartWebSocketClient() { packet->type = kAudioPacketTypeSentenceStart; packet->text = cJSON_GetObjectItem(root, "text")->valuestring; } - xQueueSend(audio_decode_queue_, &packet, portMAX_DELAY); + + std::lock_guard lock(mutex_); + audio_decode_queue_.push_back(packet); + cv_.notify_all(); } else if (strcmp(type->valuestring, "stt") == 0) { auto text = cJSON_GetObjectItem(root, "text"); if (text != NULL) { @@ -804,6 +629,14 @@ void Application::StartWebSocketClient() { ws_client_->OnDisconnected([this]() { ESP_LOGI(TAG, "Websocket disconnected"); + Schedule([this]() { +#ifdef CONFIG_USE_AFE_SR + audio_processor_.Stop(); +#endif + delete ws_client_; + ws_client_ = nullptr; + SetChatState(kChatStateIdle); + }); }); if (!ws_client_->Connect(CONFIG_WEBSOCKET_URL)) { diff --git a/main/Application.h b/main/Application.h index f695a16..6ad5d0f 100644 --- a/main/Application.h +++ b/main/Application.h @@ -2,28 +2,33 @@ #define _APPLICATION_H_ #include "AudioDevice.h" -#include "OpusEncoder.h" -#include "OpusResampler.h" -#include "WebSocket.h" -#include "Display.h" -#include "Ml307AtModem.h" -#include "FirmwareUpgrade.h" -#include "Ml307Http.h" -#include "EspHttp.h" +#include +#include +#include +#include +#include +#include -#include "opus.h" -#include "resampler_structs.h" -#include "freertos/event_groups.h" -#include "freertos/queue.h" -#include "freertos/task.h" -#include "esp_afe_sr_models.h" -#include "esp_nsn_models.h" +#include +#include +#include +#include #include #include +#include + +#include "Display.h" +#include "FirmwareUpgrade.h" + +#ifdef CONFIG_USE_AFE_SR +#include "WakeWordDetect.h" +#include "AudioProcessor.h" +#endif + +#include "Button.h" #define DETECTION_RUNNING 1 #define COMMUNICATION_RUNNING 2 -#define WAKE_WORD_ENCODED 4 #define PROTOCOL_VERSION 2 struct BinaryProtocol { @@ -35,6 +40,23 @@ struct BinaryProtocol { uint8_t payload[]; } __attribute__((packed)); +enum AudioPacketType { + kAudioPacketTypeUnkonwn = 0, + kAudioPacketTypeStart, + kAudioPacketTypeStop, + kAudioPacketTypeData, + kAudioPacketTypeSentenceStart, + kAudioPacketTypeSentenceEnd +}; + +struct AudioPacket { + AudioPacketType type = kAudioPacketTypeUnkonwn; + std::string text; + std::vector opus; + std::vector pcm; + uint32_t timestamp; +}; + enum ChatState { kChatStateIdle, @@ -42,7 +64,6 @@ enum ChatState { kChatStateListening, kChatStateSpeaking, kChatStateWakeWordDetected, - kChatStateTesting, kChatStateUpgrading }; @@ -63,7 +84,12 @@ private: Application(); ~Application(); + Button button_; AudioDevice audio_device_; +#ifdef CONFIG_USE_AFE_SR + WakeWordDetect wake_word_detect_; + AudioProcessor audio_processor_; +#endif #ifdef CONFIG_USE_ML307 Ml307AtModem ml307_at_modem_; Ml307Http http_; @@ -74,25 +100,22 @@ private: #ifdef CONFIG_USE_DISPLAY Display display_; #endif - - std::recursive_mutex mutex_; + std::mutex mutex_; + std::condition_variable_any cv_; + std::list> main_tasks_; WebSocket* ws_client_ = nullptr; - esp_afe_sr_data_t* afe_detection_data_ = nullptr; - esp_afe_sr_data_t* afe_communication_data_ = nullptr; EventGroupHandle_t event_group_; - char* wakenet_model_ = NULL; volatile ChatState chat_state_ = kChatStateIdle; + volatile bool break_speaking_ = false; + bool skip_to_end_ = false; // Audio encode / decode - TaskHandle_t audio_feed_task_ = nullptr; + TaskHandle_t audio_encode_task_ = nullptr; StaticTask_t audio_encode_task_buffer_; StackType_t* audio_encode_task_stack_ = nullptr; - QueueHandle_t audio_encode_queue_ = nullptr; - - TaskHandle_t audio_decode_task_ = nullptr; - StaticTask_t audio_decode_task_buffer_; - StackType_t* audio_decode_task_stack_ = nullptr; - QueueHandle_t audio_decode_queue_ = nullptr; + std::list> audio_encode_queue_; + std::list audio_decode_queue_; + std::list audio_play_queue_; OpusEncoder opus_encoder_; OpusDecoder* opus_decoder_ = nullptr; @@ -100,38 +123,23 @@ private: int opus_duration_ms_ = 60; int opus_decode_sample_rate_ = CONFIG_AUDIO_OUTPUT_SAMPLE_RATE; OpusResampler opus_resampler_; - OpusResampler test_resampler_; - std::vector test_pcm_; - - TaskHandle_t wake_word_encode_task_ = nullptr; - StaticTask_t wake_word_encode_task_buffer_; - StackType_t* wake_word_encode_task_stack_ = nullptr; - std::list wake_word_pcm_; - std::string wake_word_opus_; TaskHandle_t check_new_version_task_ = nullptr; StaticTask_t check_new_version_task_buffer_; StackType_t* check_new_version_task_stack_ = nullptr; - BinaryProtocol* AllocateBinaryProtocol(void* payload, size_t payload_size); + void MainLoop(); + void Schedule(std::function callback); + BinaryProtocol* AllocateBinaryProtocol(const uint8_t* payload, size_t payload_size); void SetDecodeSampleRate(int sample_rate); void SetChatState(ChatState state); - void StartDetection(); - void StartCommunication(); void StartWebSocketClient(); - void StoreWakeWordData(uint8_t* data, size_t size); - void EncodeWakeWordData(); - void SendWakeWordData(); - void CheckTestButton(); - void PlayTestAudio(); void CheckNewVersion(); void UpdateDisplay(); - - void AudioFeedTask(); - void AudioDetectionTask(); - void AudioCommunicationTask(); + void AudioEncodeTask(); - void AudioDecodeTask(); + void AudioPlayTask(); + void HandleAudioPacket(AudioPacket* packet); }; #endif // _APPLICATION_H_ diff --git a/main/AudioDevice.cc b/main/AudioDevice.cc index 3d06d39..5f7fac7 100644 --- a/main/AudioDevice.cc +++ b/main/AudioDevice.cc @@ -1,18 +1,15 @@ #include "AudioDevice.h" -#include "esp_log.h" +#include #include #define TAG "AudioDevice" AudioDevice::AudioDevice() { - audio_play_queue_ = xQueueCreate(100, sizeof(AudioPacket*)); } AudioDevice::~AudioDevice() { - vQueueDelete(audio_play_queue_); - - if (audio_play_task_ != nullptr) { - vTaskDelete(audio_play_task_); + if (audio_input_task_ != nullptr) { + vTaskDelete(audio_input_task_); } if (rx_handle_ != nullptr) { ESP_ERROR_CHECK(i2s_channel_disable(rx_handle_)); @@ -37,8 +34,8 @@ void AudioDevice::Start(int input_sample_rate, int output_sample_rate) { xTaskCreate([](void* arg) { auto audio_device = (AudioDevice*)arg; - audio_device->AudioPlayTask(); - }, "audio_play", 4096 * 4, this, 5, &audio_play_task_); + audio_device->InputTask(); + }, "audio_input", 4096 * 2, this, 5, &audio_input_task_); } void AudioDevice::CreateDuplexChannels() { @@ -180,58 +177,22 @@ int AudioDevice::Read(int16_t* dest, int samples) { return samples; } -void AudioDevice::QueueAudioPacket(AudioPacket* packet) { - xQueueSend(audio_play_queue_, &packet, portMAX_DELAY); +void AudioDevice::OnInputData(std::function callback) { + on_input_data_ = callback; } -void AudioDevice::AudioPlayTask() { - while (true) { - AudioPacket* packet; - xQueueReceive(audio_play_queue_, &packet, portMAX_DELAY); +void AudioDevice::OutputData(std::vector& data) { + Write(data.data(), data.size()); +} - switch (packet->type) - { - case kAudioPacketTypeStart: - playing_ = true; - breaked_ = false; - if (on_state_changed_) { - on_state_changed_(); - } - break; - case kAudioPacketTypeStop: - playing_ = false; - if (on_state_changed_) { - on_state_changed_(); - } - break; - case kAudioPacketTypeSentenceStart: - ESP_LOGI(TAG, "<< %s", packet->text.c_str()); - break; - case kAudioPacketTypeSentenceEnd: - if (breaked_) { // Clear the queue - AudioPacket* p; - while (xQueueReceive(audio_play_queue_, &p, 0) == pdTRUE) { - delete p; - } - breaked_ = false; - playing_ = false; - } - break; - case kAudioPacketTypeData: - Write(packet->pcm.data(), packet->pcm.size()); - last_timestamp_ = packet->timestamp; - break; - default: - ESP_LOGE(TAG, "Unknown audio packet type: %d", packet->type); +void AudioDevice::InputTask() { + int duration = 30; + int input_frame_size = input_sample_rate_ / 1000 * duration; + int16_t input_buffer[input_frame_size]; + while (true) { + int samples = Read(input_buffer, input_frame_size); + if (samples > 0) { + on_input_data_(input_buffer, samples); } - delete packet; } } - -void AudioDevice::OnStateChanged(std::function callback) { - on_state_changed_ = callback; -} - -void AudioDevice::Break() { - breaked_ = true; -} diff --git a/main/AudioDevice.h b/main/AudioDevice.h index ad591ea..84bbed5 100644 --- a/main/AudioDevice.h +++ b/main/AudioDevice.h @@ -1,71 +1,44 @@ #ifndef _AUDIO_DEVICE_H #define _AUDIO_DEVICE_H -#include "opus.h" -#include "freertos/FreeRTOS.h" -#include "freertos/queue.h" -#include "freertos/event_groups.h" -#include "driver/i2s_std.h" +#include +#include +#include #include #include #include -enum AudioPacketType { - kAudioPacketTypeUnkonwn = 0, - kAudioPacketTypeStart, - kAudioPacketTypeStop, - kAudioPacketTypeData, - kAudioPacketTypeSentenceStart, - kAudioPacketTypeSentenceEnd -}; - -struct AudioPacket { - AudioPacketType type = kAudioPacketTypeUnkonwn; - std::string text; - std::vector opus; - std::vector pcm; - uint32_t timestamp; -}; - class AudioDevice { public: AudioDevice(); ~AudioDevice(); void Start(int input_sample_rate, int output_sample_rate); - int Read(int16_t* dest, int samples); - void Write(const int16_t* data, int samples); - void QueueAudioPacket(AudioPacket* packet); - void OnStateChanged(std::function callback); - void Break(); + void OnInputData(std::function callback); + void OutputData(std::vector& data); int input_sample_rate() const { return input_sample_rate_; } int output_sample_rate() const { return output_sample_rate_; } bool duplex() const { return duplex_; } - bool playing() const { return playing_; } - uint32_t last_timestamp() const { return last_timestamp_; } private: - bool playing_ = false; - bool breaked_ = false; bool duplex_ = false; int input_sample_rate_ = 0; int output_sample_rate_ = 0; - uint32_t last_timestamp_ = 0; - i2s_chan_handle_t tx_handle_ = nullptr; i2s_chan_handle_t rx_handle_ = nullptr; - QueueHandle_t audio_play_queue_ = nullptr; - TaskHandle_t audio_play_task_ = nullptr; + TaskHandle_t audio_input_task_ = nullptr; EventGroupHandle_t event_group_; - std::function on_state_changed_; + std::function on_input_data_; void CreateDuplexChannels(); void CreateSimplexChannels(); - void AudioPlayTask(); + void InputTask(); + int Read(int16_t* dest, int samples); + void Write(const int16_t* data, int samples); }; #endif // _AUDIO_DEVICE_H diff --git a/main/AudioProcessor.cc b/main/AudioProcessor.cc new file mode 100644 index 0000000..e710ba4 --- /dev/null +++ b/main/AudioProcessor.cc @@ -0,0 +1,106 @@ +#include "AudioProcessor.h" +#include + +#define PROCESSOR_RUNNING 0x01 + +static const char* TAG = "AudioProcessor"; + +AudioProcessor::AudioProcessor() + : afe_communication_data_(nullptr) { + event_group_ = xEventGroupCreate(); + + afe_config_t afe_config = { + .aec_init = false, + .se_init = true, + .vad_init = false, + .wakenet_init = false, + .voice_communication_init = true, + .voice_communication_agc_init = true, + .voice_communication_agc_gain = 10, + .vad_mode = VAD_MODE_3, + .wakenet_model_name = NULL, + .wakenet_model_name_2 = NULL, + .wakenet_mode = DET_MODE_90, + .afe_mode = SR_MODE_HIGH_PERF, + .afe_perferred_core = 0, + .afe_perferred_priority = 5, + .afe_ringbuf_size = 50, + .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, + .afe_linear_gain = 1.0, + .agc_mode = AFE_MN_PEAK_AGC_MODE_2, + .pcm_config = { + .total_ch_num = 1, + .mic_num = 1, + .ref_num = 0, + .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE, + }, + .debug_init = false, + .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }}, + .afe_ns_mode = NS_MODE_SSP, + .afe_ns_model_name = NULL, + .fixed_first_channel = true, + }; + + afe_communication_data_ = esp_afe_vc_v1.create_from_config(&afe_config); + + xTaskCreate([](void* arg) { + auto this_ = (AudioProcessor*)arg; + this_->AudioProcessorTask(); + vTaskDelete(NULL); + }, "audio_communication", 4096 * 2, this, 5, NULL); +} + +AudioProcessor::~AudioProcessor() { + if (afe_communication_data_ != nullptr) { + esp_afe_vc_v1.destroy(afe_communication_data_); + } + vEventGroupDelete(event_group_); +} + +void AudioProcessor::Input(const int16_t* data, int size) { + input_buffer_.insert(input_buffer_.end(), data, data + size); + + auto chunk_size = esp_afe_vc_v1.get_feed_chunksize(afe_communication_data_); + while (input_buffer_.size() >= chunk_size) { + auto chunk = input_buffer_.data(); + esp_afe_vc_v1.feed(afe_communication_data_, chunk); + input_buffer_.erase(input_buffer_.begin(), input_buffer_.begin() + chunk_size); + } +} + +void AudioProcessor::Start() { + xEventGroupSetBits(event_group_, PROCESSOR_RUNNING); +} + +void AudioProcessor::Stop() { + xEventGroupClearBits(event_group_, PROCESSOR_RUNNING); +} + +bool AudioProcessor::IsRunning() { + return xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING; +} + +void AudioProcessor::OnOutput(std::function&& data)> callback) { + output_callback_ = callback; +} + +void AudioProcessor::AudioProcessorTask() { + int chunk_size = esp_afe_vc_v1.get_fetch_chunksize(afe_communication_data_); + ESP_LOGI(TAG, "Audio communication task started, chunk size: %d", chunk_size); + + while (true) { + xEventGroupWaitBits(event_group_, PROCESSOR_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY); + + auto res = esp_afe_vc_v1.fetch(afe_communication_data_); + if (res == nullptr || res->ret_value == ESP_FAIL) { + if (res != nullptr) { + ESP_LOGI(TAG, "Error code: %d", res->ret_value); + } + continue; + } + + if (output_callback_) { + output_callback_(std::vector(res->data, res->data + res->data_size / sizeof(int16_t))); + } + } +} diff --git a/main/AudioProcessor.h b/main/AudioProcessor.h new file mode 100644 index 0000000..993a170 --- /dev/null +++ b/main/AudioProcessor.h @@ -0,0 +1,33 @@ +#ifndef AUDIO_PROCESSOR_H +#define AUDIO_PROCESSOR_H + +#include +#include +#include +#include + +#include +#include +#include + +class AudioProcessor { +public: + AudioProcessor(); + ~AudioProcessor(); + + void Input(const int16_t* data, int size); + void Start(); + void Stop(); + bool IsRunning(); + void OnOutput(std::function&& data)> callback); + +private: + EventGroupHandle_t event_group_ = nullptr; + esp_afe_sr_data_t* afe_communication_data_ = nullptr; + std::vector input_buffer_; + std::function&& data)> output_callback_; + + void AudioProcessorTask(); +}; + +#endif diff --git a/main/Button.cc b/main/Button.cc new file mode 100644 index 0000000..5dcaf3c --- /dev/null +++ b/main/Button.cc @@ -0,0 +1,67 @@ +#include "Button.h" +#include + +static const char* TAG = "Button"; + +Button::Button(gpio_num_t gpio_num) : gpio_num_(gpio_num) { + button_config_t button_config = { + .type = BUTTON_TYPE_GPIO, + .long_press_time = 3000, + .short_press_time = 100, + .gpio_button_config = { + .gpio_num = gpio_num, + .active_level = 0 + } + }; + button_handle_ = iot_button_create(&button_config); + if (button_handle_ == NULL) { + ESP_LOGE(TAG, "Failed to create button handle"); + return; + } +} + +Button::~Button() { + if (button_handle_ != NULL) { + iot_button_delete(button_handle_); + } +} + +void Button::OnPress(std::function callback) { + on_press_ = callback; + iot_button_register_cb(button_handle_, BUTTON_PRESS_DOWN, [](void* handle, void* usr_data) { + Button* button = static_cast(usr_data); + if (button->on_press_) { + button->on_press_(); + } + }, this); +} + +void Button::OnLongPress(std::function callback) { + on_long_press_ = callback; + iot_button_register_cb(button_handle_, BUTTON_LONG_PRESS_START, [](void* handle, void* usr_data) { + Button* button = static_cast(usr_data); + if (button->on_long_press_) { + button->on_long_press_(); + } + }, this); +} + +void Button::OnClick(std::function callback) { + on_click_ = callback; + iot_button_register_cb(button_handle_, BUTTON_SINGLE_CLICK, [](void* handle, void* usr_data) { + Button* button = static_cast(usr_data); + if (button->on_click_) { + button->on_click_(); + } + }, this); +} + +void Button::OnDoubleClick(std::function callback) { + on_double_click_ = callback; + iot_button_register_cb(button_handle_, BUTTON_DOUBLE_CLICK, [](void* handle, void* usr_data) { + Button* button = static_cast(usr_data); + if (button->on_double_click_) { + button->on_double_click_(); + } + }, this); +} diff --git a/main/Button.h b/main/Button.h new file mode 100644 index 0000000..347df5c --- /dev/null +++ b/main/Button.h @@ -0,0 +1,28 @@ +#ifndef BUTTON_H_ +#define BUTTON_H_ + +#include +#include +#include + +class Button { +public: + Button(gpio_num_t gpio_num); + ~Button(); + + void OnPress(std::function callback); + void OnLongPress(std::function callback); + void OnClick(std::function callback); + void OnDoubleClick(std::function callback); +private: + gpio_num_t gpio_num_; + button_handle_t button_handle_; + + + std::function on_press_; + std::function on_long_press_; + std::function on_click_; + std::function on_double_click_; +}; + +#endif // BUTTON_H_ diff --git a/main/CMakeLists.txt b/main/CMakeLists.txt index d922902..803fef2 100755 --- a/main/CMakeLists.txt +++ b/main/CMakeLists.txt @@ -4,9 +4,14 @@ set(SOURCES "AudioDevice.cc" "SystemReset.cc" "Application.cc" "Display.cc" + "Button.cc" "main.cc" ) +if(CONFIG_USE_AFE_SR) + list(APPEND SOURCES "AudioProcessor.cc" "WakeWordDetect.cc") +endif() + idf_component_register(SRCS ${SOURCES} INCLUDE_DIRS "." ) diff --git a/main/Display.cc b/main/Display.cc index f46da1b..6f141be 100644 --- a/main/Display.cc +++ b/main/Display.cc @@ -1,11 +1,11 @@ #include "Display.h" -#include "esp_log.h" -#include "esp_err.h" -#include "esp_lcd_panel_ops.h" -#include "esp_lcd_panel_vendor.h" -#include "esp_lvgl_port.h" +#include +#include +#include +#include +#include #include #include diff --git a/main/Display.h b/main/Display.h index 9b9c614..308778b 100644 --- a/main/Display.h +++ b/main/Display.h @@ -1,10 +1,10 @@ #ifndef DISPLAY_H #define DISPLAY_H -#include "driver/i2c_master.h" -#include "esp_lcd_panel_io.h" -#include "esp_lcd_panel_ops.h" -#include "lvgl.h" +#include +#include +#include +#include #include diff --git a/main/FirmwareUpgrade.cc b/main/FirmwareUpgrade.cc index 9d4a3ed..56f67ab 100644 --- a/main/FirmwareUpgrade.cc +++ b/main/FirmwareUpgrade.cc @@ -1,12 +1,12 @@ #include "FirmwareUpgrade.h" #include "SystemInfo.h" -#include "cJSON.h" -#include "esp_log.h" -#include "esp_partition.h" -#include "esp_http_client.h" -#include "esp_ota_ops.h" -#include "esp_app_format.h" -#include "Ml307Http.h" +#include +#include +#include +#include +#include +#include + #include #include #include diff --git a/main/FirmwareUpgrade.h b/main/FirmwareUpgrade.h index 07dc99f..1fe86c8 100644 --- a/main/FirmwareUpgrade.h +++ b/main/FirmwareUpgrade.h @@ -4,7 +4,8 @@ #include #include #include -#include "Http.h" + +#include class FirmwareUpgrade { public: diff --git a/main/Kconfig.projbuild b/main/Kconfig.projbuild index fbed734..1597144 100644 --- a/main/Kconfig.projbuild +++ b/main/Kconfig.projbuild @@ -74,6 +74,18 @@ config AUDIO_DEVICE_I2S_SPK_GPIO_WS help GPIO number of the I2S MIC WS. +config BOOT_BUTTON_GPIO + int "Boot Button GPIO" + default 0 + help + GPIO number of the boot button. + +config USE_AFE_SR + bool "Use Espressif AFE SR" + default y + help + Use AFE SR for wake word detection. + config USE_ML307 bool "Use ML307" default n diff --git a/main/SystemInfo.cc b/main/SystemInfo.cc index 684f519..f62393f 100644 --- a/main/SystemInfo.cc +++ b/main/SystemInfo.cc @@ -1,15 +1,13 @@ #include "SystemInfo.h" -#include "freertos/task.h" -#include "esp_log.h" -#include "esp_flash.h" -#include "esp_mac.h" -#include "esp_chip_info.h" -#include "esp_system.h" -#include "esp_partition.h" -#include "esp_app_desc.h" -#include "esp_psram.h" -#include "esp_wifi.h" -#include "esp_ota_ops.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include #define TAG "SystemInfo" @@ -80,7 +78,6 @@ std::string SystemInfo::GetJsonString() { */ std::string json = "{"; json += "\"flash_size\":" + std::to_string(GetFlashSize()) + ","; - json += "\"psram_size\":" + std::to_string(esp_psram_get_size()) + ","; json += "\"minimum_free_heap_size\":" + std::to_string(GetMinimumFreeHeapSize()) + ","; json += "\"mac_address\":\"" + GetMacAddress() + "\","; json += "\"chip_model_name\":\"" + GetChipModelName() + "\","; diff --git a/main/SystemInfo.h b/main/SystemInfo.h index b0744d2..a750bcf 100644 --- a/main/SystemInfo.h +++ b/main/SystemInfo.h @@ -3,8 +3,8 @@ #include -#include "esp_err.h" -#include "freertos/FreeRTOS.h" +#include +#include class SystemInfo { public: diff --git a/main/SystemReset.cc b/main/SystemReset.cc index 038915c..94d2017 100644 --- a/main/SystemReset.cc +++ b/main/SystemReset.cc @@ -1,10 +1,10 @@ #include "SystemReset.h" -#include "esp_log.h" -#include "nvs_flash.h" -#include "driver/gpio.h" -#include "esp_partition.h" -#include "esp_system.h" -#include "freertos/FreeRTOS.h" +#include +#include +#include +#include +#include +#include #define TAG "SystemReset" diff --git a/main/WakeWordDetect.cc b/main/WakeWordDetect.cc new file mode 100644 index 0000000..83282bc --- /dev/null +++ b/main/WakeWordDetect.cc @@ -0,0 +1,203 @@ +#include +#include + +#include "WakeWordDetect.h" +#include "Application.h" + +#define DETECTION_RUNNING_EVENT 1 +#define WAKE_WORD_ENCODED_EVENT 2 + +static const char* TAG = "WakeWordDetect"; + +WakeWordDetect::WakeWordDetect() + : afe_detection_data_(nullptr), + wake_word_pcm_(), + wake_word_opus_() { + + event_group_ = xEventGroupCreate(); + + srmodel_list_t *models = esp_srmodel_init("model"); + for (int i = 0; i < models->num; i++) { + ESP_LOGI(TAG, "Model %d: %s", i, models->model_name[i]); + if (strstr(models->model_name[i], ESP_WN_PREFIX) != NULL) { + wakenet_model_ = models->model_name[i]; + } + } + + afe_config_t afe_config = { + .aec_init = false, + .se_init = true, + .vad_init = true, + .wakenet_init = true, + .voice_communication_init = false, + .voice_communication_agc_init = false, + .voice_communication_agc_gain = 10, + .vad_mode = VAD_MODE_3, + .wakenet_model_name = wakenet_model_, + .wakenet_model_name_2 = NULL, + .wakenet_mode = DET_MODE_90, + .afe_mode = SR_MODE_HIGH_PERF, + .afe_perferred_core = 0, + .afe_perferred_priority = 5, + .afe_ringbuf_size = 50, + .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, + .afe_linear_gain = 1.0, + .agc_mode = AFE_MN_PEAK_AGC_MODE_2, + .pcm_config = { + .total_ch_num = 1, + .mic_num = 1, + .ref_num = 0, + .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE + }, + .debug_init = false, + .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }}, + .afe_ns_mode = NS_MODE_SSP, + .afe_ns_model_name = NULL, + .fixed_first_channel = true, + }; + + afe_detection_data_ = esp_afe_sr_v1.create_from_config(&afe_config); + + xTaskCreate([](void* arg) { + auto this_ = (WakeWordDetect*)arg; + this_->AudioDetectionTask(); + vTaskDelete(NULL); + }, "audio_detection", 4096 * 2, this, 5, NULL); +} + +WakeWordDetect::~WakeWordDetect() { + if (afe_detection_data_ != nullptr) { + esp_afe_sr_v1.destroy(afe_detection_data_); + } + + if (wake_word_encode_task_stack_ != nullptr) { + free(wake_word_encode_task_stack_); + } + + vEventGroupDelete(event_group_); +} + +void WakeWordDetect::OnWakeWordDetected(std::function callback) { + wake_word_detected_callback_ = callback; +} + +void WakeWordDetect::OnVadStateChange(std::function callback) { + vad_state_change_callback_ = callback; +} + +void WakeWordDetect::StartDetection() { + xEventGroupSetBits(event_group_, DETECTION_RUNNING_EVENT); +} + +void WakeWordDetect::StopDetection() { + xEventGroupClearBits(event_group_, DETECTION_RUNNING_EVENT); +} + +bool WakeWordDetect::IsDetectionRunning() { + return xEventGroupGetBits(event_group_) & DETECTION_RUNNING_EVENT; +} + +void WakeWordDetect::Feed(const int16_t* data, int size) { + input_buffer_.insert(input_buffer_.end(), data, data + size); + + auto chunk_size = esp_afe_sr_v1.get_feed_chunksize(afe_detection_data_); + while (input_buffer_.size() >= chunk_size) { + esp_afe_sr_v1.feed(afe_detection_data_, input_buffer_.data()); + input_buffer_.erase(input_buffer_.begin(), input_buffer_.begin() + chunk_size); + } +} + +void WakeWordDetect::AudioDetectionTask() { + auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_); + ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size); + + while (true) { + xEventGroupWaitBits(event_group_, DETECTION_RUNNING_EVENT, pdFALSE, pdTRUE, portMAX_DELAY); + + auto res = esp_afe_sr_v1.fetch(afe_detection_data_); + if (res == nullptr || res->ret_value == ESP_FAIL) { + if (res != nullptr) { + ESP_LOGI(TAG, "Error code: %d", res->ret_value); + } + continue;; + } + + // Store the wake word data for voice recognition, like who is speaking + StoreWakeWordData((uint16_t*)res->data, res->data_size / sizeof(uint16_t)); + + // VAD state change + if (vad_state_change_callback_) { + if (res->vad_state == AFE_VAD_SPEECH && !is_speaking_) { + is_speaking_ = true; + vad_state_change_callback_(true); + } else if (res->vad_state == AFE_VAD_SILENCE && is_speaking_) { + is_speaking_ = false; + vad_state_change_callback_(false); + } + } + + if (res->wakeup_state == WAKENET_DETECTED) { + ESP_LOGI(TAG, "Wake word detected"); + StopDetection(); + + if (wake_word_detected_callback_) { + wake_word_detected_callback_(); + } + } + } +} + +void WakeWordDetect::StoreWakeWordData(uint16_t* data, size_t samples) { + // store audio data to wake_word_pcm_ + std::vector pcm(data, data + samples); + wake_word_pcm_.emplace_back(std::move(pcm)); + // keep about 2 seconds of data, detect duration is 32ms (sample_rate == 16000, chunksize == 512) + while (wake_word_pcm_.size() > 2000 / 32) { + wake_word_pcm_.pop_front(); + } +} + +void WakeWordDetect::EncodeWakeWordData() { + if (wake_word_encode_task_stack_ == nullptr) { + wake_word_encode_task_stack_ = (StackType_t*)malloc(4096 * 8); + } + wake_word_encode_task_ = xTaskCreateStatic([](void* arg) { + auto this_ = (WakeWordDetect*)arg; + auto start_time = esp_timer_get_time(); + // encode detect packets + OpusEncoder* encoder = new OpusEncoder(); + encoder->Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1, 60); + encoder->SetComplexity(0); + this_->wake_word_opus_.resize(4096 * 4); + size_t offset = 0; + + for (auto& pcm: this_->wake_word_pcm_) { + encoder->Encode(pcm, [this_, &offset](const uint8_t* opus, size_t opus_size) { + size_t protocol_size = sizeof(BinaryProtocol) + opus_size; + if (offset + protocol_size < this_->wake_word_opus_.size()) { + auto protocol = (BinaryProtocol*)(&this_->wake_word_opus_[offset]); + protocol->version = htons(PROTOCOL_VERSION); + protocol->type = htons(0); + protocol->reserved = 0; + protocol->timestamp = 0; + protocol->payload_size = htonl(opus_size); + memcpy(protocol->payload, opus, opus_size); + offset += protocol_size; + } + }); + } + this_->wake_word_pcm_.clear(); + this_->wake_word_opus_.resize(offset); + + auto end_time = esp_timer_get_time(); + ESP_LOGI(TAG, "Encode wake word opus: %zu bytes in %lld ms", this_->wake_word_opus_.size(), (end_time - start_time) / 1000); + xEventGroupSetBits(this_->event_group_, WAKE_WORD_ENCODED_EVENT); + delete encoder; + vTaskDelete(NULL); + }, "encode_detect_packets", 4096 * 8, this, 1, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_); +} + +const std::string&& WakeWordDetect::GetWakeWordStream() { + xEventGroupWaitBits(event_group_, WAKE_WORD_ENCODED_EVENT, pdTRUE, pdTRUE, portMAX_DELAY); + return std::move(wake_word_opus_); +} diff --git a/main/WakeWordDetect.h b/main/WakeWordDetect.h new file mode 100644 index 0000000..81bf34c --- /dev/null +++ b/main/WakeWordDetect.h @@ -0,0 +1,50 @@ +#ifndef WAKE_WORD_DETECT_H +#define WAKE_WORD_DETECT_H + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + + +class WakeWordDetect { +public: + WakeWordDetect(); + ~WakeWordDetect(); + + void Feed(const int16_t* data, int size); + void OnWakeWordDetected(std::function callback); + void OnVadStateChange(std::function callback); + void StartDetection(); + void StopDetection(); + bool IsDetectionRunning(); + void EncodeWakeWordData(); + const std::string&& GetWakeWordStream(); + +private: + esp_afe_sr_data_t* afe_detection_data_ = nullptr; + char* wakenet_model_ = NULL; + std::vector input_buffer_; + EventGroupHandle_t event_group_; + std::function wake_word_detected_callback_; + std::function vad_state_change_callback_; + bool is_speaking_ = false; + + TaskHandle_t wake_word_encode_task_ = nullptr; + StaticTask_t wake_word_encode_task_buffer_; + StackType_t* wake_word_encode_task_stack_ = nullptr; + std::list> wake_word_pcm_; + std::string wake_word_opus_; + + void StoreWakeWordData(uint16_t* data, size_t size); + void AudioDetectionTask(); +}; + +#endif diff --git a/main/idf_component.yml b/main/idf_component.yml index b85c6d5..31dd003 100644 --- a/main/idf_component.yml +++ b/main/idf_component.yml @@ -1,10 +1,11 @@ ## IDF Component Manager Manifest File dependencies: - 78/esp-builtin-led: "^1.0.1" + 78/esp-builtin-led: "^1.0.2" 78/esp-wifi-connect: "^1.1.0" - 78/esp-opus-encoder: "^1.0.1" - 78/esp-ml307: "^1.1.0" + 78/esp-opus-encoder: "^1.0.2" + 78/esp-ml307: "^1.1.1" espressif/esp-sr: "^1.9.0" + espressif/button: "^3.3.1" lvgl/lvgl: "^8.4.0" esp_lvgl_port: "^1.4.0" ## Required IDF version diff --git a/main/main.cc b/main/main.cc index 9ea9b5e..4c60035 100755 --- a/main/main.cc +++ b/main/main.cc @@ -1,18 +1,15 @@ -#include - -#include "esp_log.h" -#include "esp_err.h" -#include "nvs.h" -#include "nvs_flash.h" -#include "driver/gpio.h" -#include "esp_event.h" +#include +#include +#include +#include +#include +#include #include "Application.h" #include "SystemInfo.h" #include "SystemReset.h" #define TAG "main" -#define STATS_TICKS pdMS_TO_TICKS(1000) extern "C" void app_main(void) { @@ -37,8 +34,9 @@ extern "C" void app_main(void) // Dump CPU usage every 10 second while (true) { vTaskDelay(10000 / portTICK_PERIOD_MS); - // SystemInfo::PrintRealTimeStats(STATS_TICKS); - int free_sram = heap_caps_get_minimum_free_size(MALLOC_CAP_INTERNAL); - ESP_LOGI(TAG, "Free heap size: %u minimal internal: %u", SystemInfo::GetFreeHeapSize(), free_sram); + // SystemInfo::PrintRealTimeStats(pdMS_TO_TICKS(1000)); + int free_sram = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); + int min_free_sram = heap_caps_get_minimum_free_size(MALLOC_CAP_INTERNAL); + ESP_LOGI(TAG, "Free internal: %u minimal internal: %u", free_sram, min_free_sram); } } diff --git a/partitions_4M.csv b/partitions_4M.csv new file mode 100644 index 0000000..101349f --- /dev/null +++ b/partitions_4M.csv @@ -0,0 +1,7 @@ +# ESP-IDF Partition Table +# Name, Type, SubType, Offset, Size, Flags +nvs, data, nvs, 0x9000, 0x4000, +otadata, data, ota, 0xd000, 0x2000, +phy_init, data, phy, 0xf000, 0x1000, +model, data, spiffs, 0x10000, 0xF0000, +factory, app, factory, 0x100000, 3M, diff --git a/sdkconfig.defaults b/sdkconfig.defaults index 9f69ed5..46f0bf7 100644 --- a/sdkconfig.defaults +++ b/sdkconfig.defaults @@ -3,23 +3,11 @@ CONFIG_BOOTLOADER_LOG_LEVEL_NONE=y CONFIG_BOOTLOADER_SKIP_VALIDATE_ALWAYS=y CONFIG_BOOTLOADER_APP_ROLLBACK_ENABLE=y -CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y - -CONFIG_SPIRAM=y -CONFIG_SPIRAM_MODE_OCT=y -CONFIG_SPIRAM_SPEED_80M=y -CONFIG_SPIRAM_MALLOC_ALWAYSINTERNAL=4096 -CONFIG_SPIRAM_TRY_ALLOCATE_WIFI_LWIP=y -CONFIG_SPIRAM_MALLOC_RESERVE_INTERNAL=32768 -CONFIG_SPIRAM_MEMTEST=n -CONFIG_MBEDTLS_EXTERNAL_MEM_ALLOC=y - CONFIG_HTTPD_MAX_REQ_HDR_LEN=2048 CONFIG_HTTPD_MAX_URI_LEN=2048 CONFIG_PARTITION_TABLE_CUSTOM=y CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions.csv" -CONFIG_PARTITION_TABLE_FILENAME="partitions.csv" CONFIG_PARTITION_TABLE_OFFSET=0x8000 CONFIG_USE_WAKENET=y diff --git a/sdkconfig.defaults.esp32c3 b/sdkconfig.defaults.esp32c3 new file mode 100644 index 0000000..dc2584f --- /dev/null +++ b/sdkconfig.defaults.esp32c3 @@ -0,0 +1,6 @@ + +CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y + +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions_4M.csv" +CONFIG_PARTITION_TABLE_OFFSET=0x8000 \ No newline at end of file diff --git a/sdkconfig.defaults.esp32s3 b/sdkconfig.defaults.esp32s3 index 9f7a8f7..c2d0bfb 100644 --- a/sdkconfig.defaults.esp32s3 +++ b/sdkconfig.defaults.esp32s3 @@ -2,6 +2,17 @@ CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_ESPTOOLPY_FLASHMODE_QIO=y +CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y + +CONFIG_SPIRAM=y +CONFIG_SPIRAM_MODE_OCT=y +CONFIG_SPIRAM_SPEED_80M=y +CONFIG_SPIRAM_MALLOC_ALWAYSINTERNAL=4096 +CONFIG_SPIRAM_TRY_ALLOCATE_WIFI_LWIP=y +CONFIG_SPIRAM_MALLOC_RESERVE_INTERNAL=32768 +CONFIG_SPIRAM_MEMTEST=n +CONFIG_MBEDTLS_EXTERNAL_MEM_ALLOC=y + CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y CONFIG_ESP32S3_DATA_CACHE_64KB=y -CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y \ No newline at end of file +CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y