From 5f8389c135f589f9a3b63fe4fd7893f72a097b03 Mon Sep 17 00:00:00 2001 From: Terrence Date: Mon, 28 Apr 2025 16:29:33 +0800 Subject: [PATCH] Add binary protocol v2 & v3 to websocket --- main/ota.cc | 2 + main/protocols/mqtt_protocol.cc | 5 ++ main/protocols/protocol.h | 15 ++++++ main/protocols/websocket_protocol.cc | 76 +++++++++++++++++++++++----- main/protocols/websocket_protocol.h | 1 + 5 files changed, 86 insertions(+), 13 deletions(-) diff --git a/main/ota.cc b/main/ota.cc index f40335a..8240034 100644 --- a/main/ota.cc +++ b/main/ota.cc @@ -154,6 +154,8 @@ bool Ota::CheckVersion() { cJSON_ArrayForEach(item, websocket) { if (item->type == cJSON_String) { settings.SetString(item->string, item->valuestring); + } else if (item->type == cJSON_Number) { + settings.SetInt(item->string, item->valueint); } } has_websocket_config_ = true; diff --git a/main/protocols/mqtt_protocol.cc b/main/protocols/mqtt_protocol.cc index 3cda2e7..19f94b8 100644 --- a/main/protocols/mqtt_protocol.cc +++ b/main/protocols/mqtt_protocol.cc @@ -207,6 +207,11 @@ bool MqttProtocol::OpenAudioChannel() { } udp_ = Board::GetInstance().CreateUdp(); udp_->OnMessage([this](const std::string& data) { + /* + * UDP Encrypted OPUS Packet Format: + * |type 1u|flags 1u|payload_len 2u|ssrc 4u|timestamp 4u|sequence 4u| + * |payload payload_len| + */ if (data.size() < sizeof(aes_nonce_)) { ESP_LOGE(TAG, "Invalid audio packet size: %zu", data.size()); return; diff --git a/main/protocols/protocol.h b/main/protocols/protocol.h index 5fdd575..3a377f5 100644 --- a/main/protocols/protocol.h +++ b/main/protocols/protocol.h @@ -5,6 +5,21 @@ #include #include #include +#include + +struct AudioStreamPacket { + uint32_t timestamp; + std::vector payload; +}; + +struct BinaryProtocol2 { + uint16_t version; + uint16_t type; // Message type (0: OPUS, 1: JSON) + uint32_t reserved; // Reserved for future use + uint32_t timestamp; // Timestamp in milliseconds (used for server-side AEC) + uint32_t payload_size; // Payload 
size in bytes + uint8_t payload[]; // Payload data +} __attribute__((packed)); struct BinaryProtocol3 { uint8_t type; diff --git a/main/protocols/websocket_protocol.cc b/main/protocols/websocket_protocol.cc index b08f5b6..4e48d31 100644 --- a/main/protocols/websocket_protocol.cc +++ b/main/protocols/websocket_protocol.cc @@ -33,9 +33,37 @@ void WebsocketProtocol::SendAudio(const std::vector& data) { return; } - busy_sending_audio_ = true; - websocket_->Send(data.data(), data.size(), true); - busy_sending_audio_ = false; + if (version_ == 2) { + std::string packet; + packet.resize(sizeof(BinaryProtocol2) + data.size()); + auto bp2 = (BinaryProtocol2*)packet.data(); + bp2->version = htons(version_); + bp2->type = 0; + bp2->reserved = 0; + bp2->timestamp = htonl(0); + bp2->payload_size = htonl(data.size()); + memcpy(bp2->payload, data.data(), data.size()); + + busy_sending_audio_ = true; + websocket_->Send(packet.data(), packet.size(), true); + busy_sending_audio_ = false; + } else if (version_ == 3) { + std::string packet; + packet.resize(sizeof(BinaryProtocol3) + data.size()); + auto bp3 = (BinaryProtocol3*)packet.data(); + bp3->type = 0; + bp3->reserved = 0; + bp3->payload_size = htons(data.size()); + memcpy(bp3->payload, data.data(), data.size()); + + busy_sending_audio_ = true; + websocket_->Send(packet.data(), packet.size(), true); + busy_sending_audio_ = false; + } else { + busy_sending_audio_ = true; + websocket_->Send(data.data(), data.size(), true); + busy_sending_audio_ = false; + } } bool WebsocketProtocol::SendText(const std::string& text) { @@ -71,25 +99,47 @@ bool WebsocketProtocol::OpenAudioChannel() { Settings settings("websocket", false); std::string url = settings.GetString("url"); std::string token = settings.GetString("token"); + int version = settings.GetInt("version"); + if (version != 0) { + version_ = version; + } busy_sending_audio_ = false; error_occurred_ = false; - - // If token not starts with "Bearer " or "bearer ", add it - if 
(token.empty() || (token.find("Bearer ") != 0 && token.find("bearer ") != 0)) {
-        token = "Bearer " + token;
-    }
 
     websocket_ = Board::GetInstance().CreateWebSocket();
-    websocket_->SetHeader("Authorization", token.c_str());
-    websocket_->SetHeader("Protocol-Version", "1");
+
+    if (!token.empty()) {
+        // If the token contains no space, prepend the "Bearer " prefix
+        if (token.find(" ") == std::string::npos) {
+            token = "Bearer " + token;
+        }
+        websocket_->SetHeader("Authorization", token.c_str());
+    }
+    websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
     websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
     websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
 
     websocket_->OnData([this](const char* data, size_t len, bool binary) {
         if (binary) {
             if (on_incoming_audio_ != nullptr) {
-                on_incoming_audio_(std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len));
+                if (version_ == 2) {
+                    auto bp2 = (const BinaryProtocol2*)data; // read-only view; the received buffer is not modified
+                    if (len >= sizeof(BinaryProtocol2) && ntohl(bp2->payload_size) <= len - sizeof(BinaryProtocol2)) {
+                        on_incoming_audio_(std::vector<uint8_t>(bp2->payload, bp2->payload + ntohl(bp2->payload_size)));
+                    } else { // frame is shorter than the header or than its declared payload: drop it
+                        ESP_LOGE(TAG, "Invalid v2 audio packet size: %zu", len);
+                    }
+                } else if (version_ == 3) {
+                    auto bp3 = (const BinaryProtocol3*)data; // read-only view; the received buffer is not modified
+                    if (len >= sizeof(BinaryProtocol3) && ntohs(bp3->payload_size) <= len - sizeof(BinaryProtocol3)) {
+                        on_incoming_audio_(std::vector<uint8_t>(bp3->payload, bp3->payload + ntohs(bp3->payload_size)));
+                    } else { // frame is shorter than the header or than its declared payload: drop it
+                        ESP_LOGE(TAG, "Invalid v3 audio packet size: %zu", len);
+                    }
+                } else {
+                    on_incoming_audio_(std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len));
+                }
             }
         } else {
             // Parse JSON data
@@ -118,7 +168,7 @@ bool WebsocketProtocol::OpenAudioChannel() {
         }
     });
 
-    ESP_LOGI(TAG, "Connecting to websocket server: %s with token: %s", url.c_str(), token.c_str());
+    ESP_LOGI(TAG, "Connecting to websocket server: %s with version: %d", url.c_str(), version_);
     if (!websocket_->Connect(url.c_str())) {
         ESP_LOGE(TAG, "Failed to connect to websocket 
server"); SetError(Lang::Strings::SERVER_NOT_FOUND); @@ -129,7 +179,7 @@ bool WebsocketProtocol::OpenAudioChannel() { // keys: message type, version, audio_params (format, sample_rate, channels) std::string message = "{"; message += "\"type\":\"hello\","; - message += "\"version\": 1,"; + message += "\"version\": " + std::to_string(version_) + ","; message += "\"transport\":\"websocket\","; message += "\"audio_params\":{"; message += "\"format\":\"opus\", \"sample_rate\":16000, \"channels\":1, \"frame_duration\":" + std::to_string(OPUS_FRAME_DURATION_MS); diff --git a/main/protocols/websocket_protocol.h b/main/protocols/websocket_protocol.h index 5e96083..db998cc 100644 --- a/main/protocols/websocket_protocol.h +++ b/main/protocols/websocket_protocol.h @@ -24,6 +24,7 @@ public: private: EventGroupHandle_t event_group_handle_; WebSocket* websocket_ = nullptr; + int version_ = 1; void ParseServerHello(const cJSON* root); bool SendText(const std::string& text) override;