From 9c9c0316bc6881f5170c1d8ceaaf984551094db7 Mon Sep 17 00:00:00 2001
From: Stephan Rotolante
Date: Mon, 21 Oct 2024 23:41:27 -0400
Subject: [PATCH] Initial work for h264 implementation

New functionality includes video capture, h264 encoding, and publishing
---
 CMakeLists.txt        |   1 +
 src/idf_component.yml |   2 +
 src/main.cpp          |  18 +++++
 src/main.h            |   6 +-
 src/media.cpp         | 181 ++++++++++++++++++++++++++++++++++++++++++
 src/webrtc.cpp        |  14 +++-
 src/websocket.cpp     |   9 ++-
 7 files changed, 226 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a84a74..5e53170 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.19)
 
 # Audio Sending is implemented, but not performant enough yet
 add_compile_definitions(SEND_AUDIO=0)
+add_compile_definitions(SEND_VIDEO=1)
 
 if(NOT IDF_TARGET STREQUAL linux)
   if(NOT DEFINED ENV{WIFI_SSID} OR NOT DEFINED ENV{WIFI_PASSWORD})
diff --git a/src/idf_component.yml b/src/idf_component.yml
index a494de4..76584d2 100644
--- a/src/idf_component.yml
+++ b/src/idf_component.yml
@@ -1,3 +1,5 @@
 dependencies:
+  espressif/esp32-camera: "^2.0.12"
+  espressif/esp_h264: "^1.0.4"
   idf:
     version: ">=4.1.0"
diff --git a/src/main.cpp b/src/main.cpp
index 4f4de74..c10a399 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -5,6 +5,8 @@
 #include
 
 #ifndef LINUX_BUILD
+#include
+
 #include "nvs_flash.h"
 
 extern "C" void app_main(void) {
@@ -18,8 +20,24 @@ extern "C" void app_main(void) {
 
   ESP_ERROR_CHECK(esp_event_loop_create_default());
   peer_init();
+
+#if SEND_AUDIO
   lk_init_audio_capture();
   lk_init_audio_decoder();
+#endif
+
+#if SEND_VIDEO
+  if (lk_init_video_capture() != ESP_OK) {
+    printf("Camera Init Failed\n");
+    return;
+  }
+
+  if (lk_init_video_encoder() != ESP_H264_ERR_OK) {
+    printf("Video Encoder failed to start\n");
+    return;
+  }
+#endif
+
   lk_wifi();
   lk_websocket(LIVEKIT_URL, LIVEKIT_TOKEN);
 }
diff --git a/src/main.h b/src/main.h
index efc5d4f..33e6959 100644
--- a/src/main.h
+++ b/src/main.h
@@ -9,10 +9,14 @@ void lk_websocket(const char *url, const char *token);
 void lk_wifi(void);
 void lk_init_audio_capture(void);
 void lk_init_audio_decoder(void);
-void lk_populate_answer(char *answer, size_t answer_size, int include_audio);
+void lk_populate_answer(char *answer, size_t answer_size, int include_media);
 void lk_publisher_peer_connection_task(void *user_data);
 void lk_subscriber_peer_connection_task(void *user_data);
 void lk_audio_encoder_task(void *arg);
 void lk_audio_decode(uint8_t *data, size_t size);
 void lk_init_audio_encoder();
 void lk_send_audio(PeerConnection *peer_connection);
+
+void lk_send_video(PeerConnection *peer_connection);
+int lk_init_video_capture(void);
+int lk_init_video_encoder(void);
diff --git a/src/media.cpp b/src/media.cpp
index da6e3bb..8bb5856 100644
--- a/src/media.cpp
+++ b/src/media.cpp
@@ -1,4 +1,10 @@
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 
 #include "main.h"
@@ -16,6 +22,63 @@
 #define OPUS_ENCODER_BITRATE 30000
 #define OPUS_ENCODER_COMPLEXITY 0
 
+#define CAM_PIN_PWDN -1
+#define CAM_PIN_RESET -1
+#define CAM_PIN_XCLK 15
+#define CAM_PIN_SIOD 4
+#define CAM_PIN_SIOC 5
+#define CAM_PIN_D7 16
+#define CAM_PIN_D6 17
+#define CAM_PIN_D5 18
+#define CAM_PIN_D4 12
+#define CAM_PIN_D3 10
+#define CAM_PIN_D2 8
+#define CAM_PIN_D1 9
+#define CAM_PIN_D0 11
+#define CAM_PIN_VSYNC 6
+#define CAM_PIN_HREF 7
+#define CAM_PIN_PCLK 13
+
+// Allocates a zeroed, `alignment`-aligned buffer for the H.264 encoder and
+// reports its size, rounded up to a multiple of `alignment`, in *actual_size.
+void *esp_h264_aligned_calloc(uint32_t alignment, uint32_t n, uint32_t size,
+                              uint32_t *actual_size, uint32_t caps) {
+  *actual_size = ALIGN_UP(n * size, alignment);
+  // Allocate the rounded-up size so the buffer is never smaller than reported.
+  void *out_ptr = heap_caps_aligned_calloc((size_t)alignment, 1,
+                                           (size_t)*actual_size, caps);
+  return out_ptr;
+}
+
+/*
+ * SPDX-FileCopyrightText: 2021 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+#include
+
+#include "sdkconfig.h"
+
+#ifdef CONFIG_FREERTOS_UNICORE
+#define CPU_NUM 1
+#else
+#define CPU_NUM CONFIG_SOC_CPU_CORES_NUM
+#endif
+
+// sysconf() shim: reports the CPU count; any other query fails with EINVAL.
+long sysconf(int arg) {
+  switch (arg) {
+    case _SC_NPROCESSORS_CONF:
+    case _SC_NPROCESSORS_ONLN:
+      return CPU_NUM;
+    default:
+      errno = EINVAL;
+      return -1;
+  }
+}
+
 void lk_init_audio_capture() {
   i2s_config_t i2s_config_out = {
       .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
@@ -104,6 +167,7 @@ void lk_audio_decode(uint8_t *data, size_t size) {
 OpusEncoder *opus_encoder = NULL;
 opus_int16 *encoder_input_buffer = NULL;
 uint8_t *encoder_output_buffer = NULL;
+esp_h264_enc_handle_t h264_encoder = NULL;
 
 void lk_init_audio_encoder() {
   int encoder_error;
@@ -140,3 +204,120 @@ void lk_send_audio(PeerConnection *peer_connection) {
   peer_connection_send_audio(peer_connection, encoder_output_buffer,
                              encoded_size);
 }
+
+camera_fb_t *fb = NULL;
+esp_h264_enc_in_frame_t in_frame;
+int ret;
+
+// Captures one camera frame, H.264-encodes it and sends it on
+// peer_connection. Failures are logged and the frame is dropped.
+void lk_send_video(PeerConnection *peer_connection) {
+  fb = esp_camera_fb_get();
+
+  if (!fb) {
+    // Nothing was captured, so there is no frame buffer to hand back.
+    printf("Camera capture failed\n");
+    return;
+  }
+
+  esp_h264_enc_out_frame_t out_frame;
+
+  in_frame.raw_data.len = fb->len;
+  in_frame.raw_data.buffer = fb->buf;
+
+  out_frame.raw_data.len = fb->width * fb->height * 2;
+  out_frame.raw_data.buffer = (uint8_t *)esp_h264_aligned_calloc(
+      16, 1, out_frame.raw_data.len, &out_frame.raw_data.len,
+      MALLOC_CAP_SPIRAM);
+
+  if (!out_frame.raw_data.buffer) {
+    printf("failed to allocate video output buffer\n");
+    esp_camera_fb_return(fb);
+    return;
+  }
+
+  if ((ret = esp_h264_enc_process(h264_encoder, &in_frame, &out_frame)) !=
+      ESP_H264_ERR_OK) {
+    printf("failed to encode %d\n", ret);
+    heap_caps_free(out_frame.raw_data.buffer);
+    esp_camera_fb_return(fb);
+    return;
+  }
+
+  if ((ret = peer_connection_send_video(peer_connection,
+                                        (uint8_t *)out_frame.raw_data.buffer,
+                                        (int)out_frame.length)) < 1) {
+    printf("failed to send video %d\n", ret);
+  }
+  heap_caps_free(out_frame.raw_data.buffer);
+  esp_camera_fb_return(fb);
+}
+
+// Creates and opens the software H.264 encoder (96x96 YUYV, 20 fps).
+// Returns ESP_H264_ERR_OK on success, otherwise the failing call's error.
+int lk_init_video_encoder() {
+  esp_h264_enc_cfg_sw_t cfg = {};
+  cfg.gop = 20;
+  cfg.fps = 20;
+  cfg.res.width = 96;
+  cfg.res.height = 96;
+  cfg.rc.bitrate = cfg.res.width * cfg.res.height * cfg.fps / 20;
+  cfg.rc.qp_min = 10;
+  cfg.rc.qp_max = 10;
+  cfg.pic_type = ESP_H264_RAW_FMT_YUYV;
+
+  int ret;
+
+  if ((ret = esp_h264_enc_sw_new(&cfg, &h264_encoder)) != ESP_H264_ERR_OK) {
+    return ret;
+  }
+
+  if ((ret = esp_h264_enc_open(h264_encoder)) != ESP_H264_ERR_OK) {
+    // Do not leak the half-initialised encoder.
+    esp_h264_enc_del(h264_encoder);
+    h264_encoder = NULL;
+    return ret;
+  }
+
+  return ESP_H264_ERR_OK;
+}
+
+// Initialises the camera driver for 96x96 YUV422 capture on the CAM_PIN_* pins.
+// Returns the esp_camera_init() result.
+int lk_init_video_capture() {
+  static camera_config_t camera_config = {
+      .pin_pwdn = CAM_PIN_PWDN,
+      .pin_reset = CAM_PIN_RESET,
+
+      .pin_xclk = CAM_PIN_XCLK,
+
+      .pin_sccb_sda = CAM_PIN_SIOD,
+      .pin_sccb_scl = CAM_PIN_SIOC,
+
+      .pin_d7 = CAM_PIN_D7,
+      .pin_d6 = CAM_PIN_D6,
+      .pin_d5 = CAM_PIN_D5,
+      .pin_d4 = CAM_PIN_D4,
+      .pin_d3 = CAM_PIN_D3,
+      .pin_d2 = CAM_PIN_D2,
+      .pin_d1 = CAM_PIN_D1,
+      .pin_d0 = CAM_PIN_D0,
+
+      .pin_vsync = CAM_PIN_VSYNC,
+      .pin_href = CAM_PIN_HREF,
+
+      .pin_pclk = CAM_PIN_PCLK,
+
+      .xclk_freq_hz = 16000000,
+
+      .ledc_timer = LEDC_TIMER_0,
+      .ledc_channel = LEDC_CHANNEL_0,
+
+      .pixel_format = PIXFORMAT_YUV422,
+      .frame_size = FRAMESIZE_96X96,
+      .jpeg_quality = 10,
+      .fb_count = 2,
+      .grab_mode = CAMERA_GRAB_WHEN_EMPTY};
+
+  return esp_camera_init(&camera_config);
+}
diff --git a/src/webrtc.cpp b/src/webrtc.cpp
index 0321583..5b0d181 100644
--- a/src/webrtc.cpp
+++ b/src/webrtc.cpp
@@ -1,5 +1,6 @@
 #ifndef LINUX_BUILD
-#include
+#include
+#include
 #include
 #endif
 
@@ -146,7 +147,9 @@ void lk_subscriber_peer_connection_task(void *user_data) {
 
 void lk_publisher_peer_connection_task(void *user_data) {
 #ifndef LINUX_BUILD
+#if SEND_AUDIO
   lk_init_audio_encoder();
+#endif
 #endif
 
   while (1) {
@@ -166,7 +169,12 @@ void lk_publisher_peer_connection_task(void *user_data) {
     }
 
 #ifndef LINUX_BUILD
+#if SEND_AUDIO
     lk_send_audio(publisher_peer_connection);
+#endif
+#if SEND_VIDEO
+    lk_send_video(publisher_peer_connection);
+#endif
 #endif
 
     peer_connection_loop(publisher_peer_connection);
@@ -177,8 +185,8 @@ void lk_publisher_peer_connection_task(void *user_data) {
 PeerConnection *lk_create_peer_connection(int isPublisher) {
   PeerConfiguration peer_connection_config = {
       .ice_servers = {},
-      .audio_codec = CODEC_OPUS,
-      .video_codec = CODEC_NONE,
+      .audio_codec = CODEC_NONE,
+      .video_codec = CODEC_H264,
       .datachannel = isPublisher ? DATA_CHANNEL_NONE : DATA_CHANNEL_STRING,
       .onaudiotrack = [](uint8_t *data, size_t size, void *userdata) -> void {
 #ifndef LINUX_BUILD
diff --git a/src/websocket.cpp b/src/websocket.cpp
index e7045a3..3d4c804 100644
--- a/src/websocket.cpp
+++ b/src/websocket.cpp
@@ -304,13 +304,20 @@ void lk_websocket(const char *room_url, const char *token) {
 
   while (true) {
     if (xSemaphoreTake(g_mutex, portMAX_DELAY) == pdTRUE) {
-      if (get_publisher_status() == 1 && SEND_AUDIO) {
+      if (get_publisher_status() == 1 && (SEND_AUDIO || SEND_VIDEO)) {
         Livekit__SignalRequest r = LIVEKIT__SIGNAL_REQUEST__INIT;
         Livekit__AddTrackRequest a = LIVEKIT__ADD_TRACK_REQUEST__INIT;
 
+#if SEND_AUDIO
         a.cid = (char *)"microphone";
         a.name = (char *)"microphone";
         a.source = LIVEKIT__TRACK_SOURCE__MICROPHONE;
+#endif
+#if SEND_VIDEO
+        a.cid = (char *)"camera";
+        a.name = (char *)"camera";
+        a.source = LIVEKIT__TRACK_SOURCE__CAMERA;
+#endif
 
         r.add_track = &a;
         r.message_case = LIVEKIT__SIGNAL_REQUEST__MESSAGE_ADD_TRACK;