From 5f8389c135f589f9a3b63fe4fd7893f72a097b03 Mon Sep 17 00:00:00 2001
From: Terrence <terrence@tenclass.com>
Date: Mon, 28 Apr 2025 16:29:33 +0800
Subject: [PATCH] Add binary protocol v2 & v3 to websocket

---
 main/ota.cc                          |  2 +
 main/protocols/mqtt_protocol.cc      |  5 ++
 main/protocols/protocol.h            | 15 ++++++
 main/protocols/websocket_protocol.cc | 76 +++++++++++++++++++++++-----
 main/protocols/websocket_protocol.h  |  1 +
 5 files changed, 86 insertions(+), 13 deletions(-)
diff --git a/main/ota.cc b/main/ota.cc
index f40335a..8240034 100644
--- a/main/ota.cc
+++ b/main/ota.cc
@@ -154,6 +154,8 @@ bool Ota::CheckVersion() {
         cJSON_ArrayForEach(item, websocket) {
             if (item->type == cJSON_String) {
                 settings.SetString(item->string, item->valuestring);
+            } else if (item->type == cJSON_Number) {
+                settings.SetInt(item->string, item->valueint);
             }
         }
         has_websocket_config_ = true;
diff --git a/main/protocols/mqtt_protocol.cc b/main/protocols/mqtt_protocol.cc
index 3cda2e7..19f94b8 100644
--- a/main/protocols/mqtt_protocol.cc
+++ b/main/protocols/mqtt_protocol.cc
@@ -207,6 +207,11 @@ bool MqttProtocol::OpenAudioChannel() {
     }
     udp_ = Board::GetInstance().CreateUdp();
     udp_->OnMessage([this](const std::string& data) {
+        /*
+         * UDP Encrypted OPUS Packet Format:
+         * |type 1u|flags 1u|payload_len 2u|ssrc 4u|timestamp 4u|sequence 4u|
+         * |payload payload_len|
+         */
         if (data.size() < sizeof(aes_nonce_)) {
             ESP_LOGE(TAG, "Invalid audio packet size: %zu", data.size());
             return;
diff --git a/main/protocols/protocol.h b/main/protocols/protocol.h
index 5fdd575..3a377f5 100644
--- a/main/protocols/protocol.h
+++ b/main/protocols/protocol.h
@@ -5,6 +5,21 @@
 #include <string>
 #include <functional>
 #include <chrono>
+#include <vector>
+
+struct AudioStreamPacket {  // One audio packet: a timestamp plus its raw payload bytes
+    uint32_t timestamp;  // NOTE(review): unit is not shown here - presumably milliseconds like BinaryProtocol2::timestamp; confirm
+    std::vector<uint8_t> payload;  // Packet bytes (presumably one encoded OPUS frame - confirm against callers)
+};
+
+struct BinaryProtocol2 {  // Websocket binary frame header (protocol version 2); multi-byte fields travel in network byte order
+    uint16_t version;       // Protocol version (sender writes it with htons)
+    uint16_t type;          // Message type (0: OPUS, 1: JSON)
+    uint32_t reserved;      // Reserved for future use
+    uint32_t timestamp;     // Timestamp in milliseconds (used for server-side AEC)
+    uint32_t payload_size;  // Payload size in bytes (excludes this 16-byte header)
+    uint8_t payload[];      // Payload data; flexible array member that starts right after the header
+} __attribute__((packed));  // packed: no padding, so the struct can be overlaid directly on the received/sent bytes
 
 struct BinaryProtocol3 {
     uint8_t type;
diff --git a/main/protocols/websocket_protocol.cc b/main/protocols/websocket_protocol.cc
index b08f5b6..4e48d31 100644
--- a/main/protocols/websocket_protocol.cc
+++ b/main/protocols/websocket_protocol.cc
@@ -33,9 +33,37 @@ void WebsocketProtocol::SendAudio(const std::vector<uint8_t>& data) {
         return;
     }
 
-    busy_sending_audio_ = true;
-    websocket_->Send(data.data(), data.size(), true);
-    busy_sending_audio_ = false;
+    if (version_ == 2) {
+        std::string packet(sizeof(BinaryProtocol2) + data.size(), '\0');  // header + payload in one buffer
+        auto bp2 = (BinaryProtocol2*)packet.data();
+        bp2->version = htons(version_);
+        bp2->type = 0;  // 0: OPUS
+        bp2->reserved = 0;
+        bp2->timestamp = htonl(0);  // SendAudio receives no timestamp, so 0 is sent
+        bp2->payload_size = htonl(data.size());
+        memcpy(bp2->payload, data.data(), data.size());
+        busy_sending_audio_ = true;
+        websocket_->Send(packet.data(), packet.size(), true);
+        busy_sending_audio_ = false;
+    } else if (version_ == 3) {
+        if (data.size() > 0xFFFF) {  // payload_size is written with htons(); a larger size would be silently truncated
+            ESP_LOGE(TAG, "Audio packet too large for protocol v3: %zu", data.size());
+            return;
+        }
+        std::string packet(sizeof(BinaryProtocol3) + data.size(), '\0');  // header + payload in one buffer
+        auto bp3 = (BinaryProtocol3*)packet.data();
+        bp3->type = 0;
+        bp3->reserved = 0;
+        bp3->payload_size = htons(data.size());
+        memcpy(bp3->payload, data.data(), data.size());
+        busy_sending_audio_ = true;
+        websocket_->Send(packet.data(), packet.size(), true);
+        busy_sending_audio_ = false;
+    } else {  // version 1 (or any other value): send the raw frame without a header
+        busy_sending_audio_ = true;
+        websocket_->Send(data.data(), data.size(), true);
+        busy_sending_audio_ = false;
+    }
 }
 
 bool WebsocketProtocol::SendText(const std::string& text) {
@@ -71,25 +99,47 @@ bool WebsocketProtocol::OpenAudioChannel() {
     Settings settings("websocket", false);
     std::string url = settings.GetString("url");
     std::string token = settings.GetString("token");
+    int version = settings.GetInt("version");
+    if (version != 0) {
+        version_ = version;
+    }
 
     busy_sending_audio_ = false;
     error_occurred_ = false;
-    
-    // If token not starts with "Bearer " or "bearer ", add it
-    if (token.empty() || (token.find("Bearer ") != 0 && token.find("bearer ") != 0)) {
-        token = "Bearer " + token;
-    }
 
     websocket_ = Board::GetInstance().CreateWebSocket();
-    websocket_->SetHeader("Authorization", token.c_str());
-    websocket_->SetHeader("Protocol-Version", "1");
+    
+    if (!token.empty()) {
+        // If the token contains no space (i.e. no scheme prefix), add the "Bearer " prefix
+        if (token.find(" ") == std::string::npos) {
+            token = "Bearer " + token;
+        }
+        websocket_->SetHeader("Authorization", token.c_str());
+    }
+    websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
     websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
     websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
 
     websocket_->OnData([this](const char* data, size_t len, bool binary) {
         if (binary) {
             if (on_incoming_audio_ != nullptr) {
-                on_incoming_audio_(std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len));
+                if (version_ == 2) {
+                    auto bp2 = (const BinaryProtocol2*)data;  // read-only view; header and size are validated before the payload is touched
+                    if (len >= sizeof(BinaryProtocol2) && ntohl(bp2->payload_size) <= len - sizeof(BinaryProtocol2)) {
+                        on_incoming_audio_(std::vector<uint8_t>(bp2->payload, bp2->payload + ntohl(bp2->payload_size)));
+                    } else {
+                        ESP_LOGE(TAG, "Invalid v2 audio packet size: %zu", len);
+                    }
+                } else if (version_ == 3) {
+                    auto bp3 = (const BinaryProtocol3*)data;  // same bounds checks as v2; the size field is read with ntohs
+                    if (len >= sizeof(BinaryProtocol3) && (size_t)ntohs(bp3->payload_size) <= len - sizeof(BinaryProtocol3)) {
+                        on_incoming_audio_(std::vector<uint8_t>(bp3->payload, bp3->payload + ntohs(bp3->payload_size)));
+                    } else {
+                        ESP_LOGE(TAG, "Invalid v3 audio packet size: %zu", len);
+                    }
+                } else {
+                    on_incoming_audio_(std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len));
+                }
             }
         } else {
             // Parse JSON data
@@ -118,7 +168,7 @@ bool WebsocketProtocol::OpenAudioChannel() {
         }
     });
 
-    ESP_LOGI(TAG, "Connecting to websocket server: %s with token: %s", url.c_str(), token.c_str());
+    ESP_LOGI(TAG, "Connecting to websocket server: %s with version: %d", url.c_str(), version_);
     if (!websocket_->Connect(url.c_str())) {
         ESP_LOGE(TAG, "Failed to connect to websocket server");
         SetError(Lang::Strings::SERVER_NOT_FOUND);
@@ -129,7 +179,7 @@ bool WebsocketProtocol::OpenAudioChannel() {
     // keys: message type, version, audio_params (format, sample_rate, channels)
     std::string message = "{";
     message += "\"type\":\"hello\",";
-    message += "\"version\": 1,";
+    message += "\"version\": " + std::to_string(version_) + ",";
     message += "\"transport\":\"websocket\",";
     message += "\"audio_params\":{";
     message += "\"format\":\"opus\", \"sample_rate\":16000, \"channels\":1, \"frame_duration\":" + std::to_string(OPUS_FRAME_DURATION_MS);
diff --git a/main/protocols/websocket_protocol.h b/main/protocols/websocket_protocol.h
index 5e96083..db998cc 100644
--- a/main/protocols/websocket_protocol.h
+++ b/main/protocols/websocket_protocol.h
@@ -24,6 +24,7 @@ public:
 private:
     EventGroupHandle_t event_group_handle_;
     WebSocket* websocket_ = nullptr;
+    int version_ = 1;
 
     void ParseServerHello(const cJSON* root);
     bool SendText(const std::string& text) override;