From 4161163d9c99a1cf4c4ea6f22ab32ab66a593ad4 Mon Sep 17 00:00:00 2001
From: zhupengfei
Date: Sat, 1 Feb 2020 12:23:07 +0800
Subject: [PATCH] ffmpeg: Correctly handle sample rates

Previously, we just used the native sample rate for encoding. However,
some encoders, such as libmp3lame, don't support it. Therefore, we now
pick a supported sample rate, preferring the native one when the
encoder allows it.

FFmpeg requires audio data to be sent in a sequence of frames, each
containing the same specific number of samples. Previously, we buffered
input samples in FFmpegBackend. However, as the source and destination
sample rates can now differ, we should buffer resampled data instead.
swresample has an internal input buffer, so we now simply forward all
data to it and gradually receive resampled data, at most one frame_size
at a time. When there is not enough resampled data to form a frame, we
record the current offset and request less data on the next call.

This commit also fixes a flaw: when an encoder supports variable frame
sizes, its frame size is reported as 0, which broke our buffering
system. We now treat variable frame size encoders as having a frame
size of 160 (the size of an HLE audio frame).

(Illustrative sketches of the sample rate selection and of the
resampling buffer follow the diff below.)
---
 src/core/dumping/ffmpeg_backend.cpp | 129 ++++++++++++++++------------
 src/core/dumping/ffmpeg_backend.h   |  19 ++--
 2 files changed, 84 insertions(+), 64 deletions(-)

diff --git a/src/core/dumping/ffmpeg_backend.cpp b/src/core/dumping/ffmpeg_backend.cpp
index ef7843ffc..3c34c8440 100644
--- a/src/core/dumping/ffmpeg_backend.cpp
+++ b/src/core/dumping/ffmpeg_backend.cpp
@@ -211,7 +211,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
     if (!FFmpegStream::Init(format_context))
         return false;
 
-    sample_count = 0;
+    frame_count = 0;
 
     // Initialize audio codec
     const AVCodec* codec = avcodec_find_encoder_by_name(Settings::values.audio_encoder.c_str());
@@ -243,7 +243,20 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
         codec_context->sample_fmt = AV_SAMPLE_FMT_S16P;
     }
 
-    codec_context->sample_rate = AudioCore::native_sample_rate;
+    if (codec->supported_samplerates) {
+        codec_context->sample_rate = codec->supported_samplerates[0];
+        // Prefer native sample rate if supported
+        const int* ptr = codec->supported_samplerates;
+        while ((*ptr)) {
+            if ((*ptr) == AudioCore::native_sample_rate) {
+                codec_context->sample_rate = AudioCore::native_sample_rate;
+                break;
+            }
+            ptr++;
+        }
+    } else {
+        codec_context->sample_rate = AudioCore::native_sample_rate;
+    }
     codec_context->channel_layout = AV_CH_LAYOUT_STEREO;
     codec_context->channels = 2;
 
@@ -259,6 +272,12 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
         LOG_WARNING(Render, "Audio encoder options not found: {}", buf);
     }
 
+    if (codec_context->frame_size) {
+        frame_size = static_cast<u64>(codec_context->frame_size);
+    } else { // variable frame size support
+        frame_size = std::tuple_size<AudioCore::StereoFrame16>::value;
+    }
+
     // Create audio stream
     stream = avformat_new_stream(format_context, codec);
     if (!stream || avcodec_parameters_from_context(stream->codecpar, codec_context.get()) < 0) {
@@ -291,7 +310,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
     // Allocate resampled data
     int error =
         av_samples_alloc_array_and_samples(&resampled_data, nullptr, codec_context->channels,
-                                           codec_context->frame_size, codec_context->sample_fmt, 0);
+                                           frame_size, codec_context->sample_fmt, 0);
     if (error < 0) {
         LOG_ERROR(Render, "Could not allocate samples storage");
         return false;
@@ -312,31 +331,62 @@ void FFmpegAudioStream::Free() {
     av_freep(&resampled_data);
 }
 
-void FFmpegAudioStream::ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) {
+void FFmpegAudioStream::ProcessFrame(const VariableAudioFrame& channel0,
+                                     const VariableAudioFrame& channel1) {
     ASSERT_MSG(channel0.size() == channel1.size(),
               "Frames of the two channels must have the same number of samples");
 
-    std::array<const u8*, 2> src_data = {reinterpret_cast<u8*>(channel0.data()),
-                                         reinterpret_cast<u8*>(channel1.data())};
-    if (swr_convert(swr_context.get(), resampled_data, channel0.size(), src_data.data(),
-                    channel0.size()) < 0) {
+    const auto sample_size = av_get_bytes_per_sample(codec_context->sample_fmt);
+    std::array<const u8*, 2> src_data = {reinterpret_cast<const u8*>(channel0.data()),
+                                         reinterpret_cast<const u8*>(channel1.data())};
+    std::array<u8*, 2> dst_data = {resampled_data[0] + sample_size * offset,
+                                   resampled_data[1] + sample_size * offset};
+
+    auto resampled_count = swr_convert(swr_context.get(), dst_data.data(), frame_size - offset,
+                                       src_data.data(), channel0.size());
+    if (resampled_count < 0) {
         LOG_ERROR(Render, "Audio frame dropped: Could not resample data");
         return;
     }
 
-    // Prepare frame
-    audio_frame->nb_samples = channel0.size();
-    audio_frame->data[0] = resampled_data[0];
-    audio_frame->data[1] = resampled_data[1];
-    audio_frame->pts = sample_count;
-    sample_count += channel0.size();
+    offset += resampled_count;
+    if (offset < frame_size) { // Still not enough to form a frame
+        return;
+    }
 
-    SendFrame(audio_frame.get());
+    while (true) {
+        // Prepare frame
+        audio_frame->nb_samples = frame_size;
+        audio_frame->data[0] = resampled_data[0];
+        audio_frame->data[1] = resampled_data[1];
+        audio_frame->pts = frame_count * frame_size;
+        frame_count++;
+
+        SendFrame(audio_frame.get());
+
+        // swr_convert buffers input internally. Try to get more resampled data
+        resampled_count = swr_convert(swr_context.get(), resampled_data, frame_size, nullptr, 0);
+        if (resampled_count < 0) {
+            LOG_ERROR(Render, "Audio frame dropped: Could not resample data");
+            return;
+        }
+        if (static_cast<u64>(resampled_count) < frame_size) {
+            offset = resampled_count;
+            break;
+        }
+    }
 }
 
-std::size_t FFmpegAudioStream::GetAudioFrameSize() const {
-    ASSERT_MSG(codec_context, "Codec context is not initialized yet!");
-    return codec_context->frame_size;
+void FFmpegAudioStream::Flush() {
+    // Send the last samples
+    audio_frame->nb_samples = offset;
+    audio_frame->data[0] = resampled_data[0];
+    audio_frame->data[1] = resampled_data[1];
+    audio_frame->pts = frame_count * frame_size;
+
+    SendFrame(audio_frame.get());
+
+    FFmpegStream::Flush();
 }
 
 FFmpegMuxer::~FFmpegMuxer() {
@@ -402,7 +452,8 @@ void FFmpegMuxer::ProcessVideoFrame(VideoFrame& frame) {
     video_stream.ProcessFrame(frame);
 }
 
-void FFmpegMuxer::ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) {
+void FFmpegMuxer::ProcessAudioFrame(const VariableAudioFrame& channel0,
+                                    const VariableAudioFrame& channel1) {
     audio_stream.ProcessFrame(channel0, channel1);
 }
 
@@ -414,10 +465,6 @@ void FFmpegMuxer::FlushAudio() {
     audio_stream.Flush();
 }
 
-std::size_t FFmpegMuxer::GetAudioFrameSize() const {
-    return audio_stream.GetAudioFrameSize();
-}
-
 void FFmpegMuxer::WriteTrailer() {
     av_write_trailer(format_context.get());
 }
@@ -498,24 +545,20 @@ void FFmpegBackend::AddVideoFrame(VideoFrame frame) {
 }
 
 void FFmpegBackend::AddAudioFrame(AudioCore::StereoFrame16 frame) {
-    std::array<std::array<s16, 160>, 2> refactored_frame;
+    std::array<VariableAudioFrame, 2> refactored_frame;
+    for (auto& channel : refactored_frame) {
+        channel.resize(frame.size());
+    }
     for (std::size_t i = 0; i < frame.size(); i++) {
        refactored_frame[0][i] = frame[i][0];
        refactored_frame[1][i] = frame[i][1];
     }
 
-    for (auto i : {0, 1}) {
-        audio_buffers[i].insert(audio_buffers[i].end(), refactored_frame[i].begin(),
-                                refactored_frame[i].end());
-    }
-    CheckAudioBuffer();
+    ffmpeg.ProcessAudioFrame(refactored_frame[0], refactored_frame[1]);
 }
 
 void FFmpegBackend::AddAudioSample(const std::array<s16, 2>& sample) {
-    for (auto i : {0, 1}) {
-        audio_buffers[i].push_back(sample[i]);
-    }
-    CheckAudioBuffer();
+    ffmpeg.ProcessAudioFrame({sample[0]}, {sample[1]});
 }
 
 void FFmpegBackend::StopDumping() {
@@ -525,12 +568,6 @@
     // Flush the video processing queue
     AddVideoFrame(VideoFrame());
     for (auto i : {0, 1}) {
-        // Add remaining data to audio queue
-        if (audio_buffers[i].size() >= 0) {
-            VariableAudioFrame buffer(audio_buffers[i].begin(), audio_buffers[i].end());
-            audio_frame_queues[i].Push(std::move(buffer));
-            audio_buffers[i].clear();
-        }
         // Flush the audio processing queue
         audio_frame_queues[i].Push(VariableAudioFrame());
     }
@@ -554,18 +591,4 @@
     processing_ended.Set();
 }
 
-void FFmpegBackend::CheckAudioBuffer() {
-    for (auto i : {0, 1}) {
-        const std::size_t frame_size = ffmpeg.GetAudioFrameSize();
-        // Add audio data to the queue when there is enough to form a frame
-        while (audio_buffers[i].size() >= frame_size) {
-            VariableAudioFrame buffer(audio_buffers[i].begin(),
-                                      audio_buffers[i].begin() + frame_size);
-            audio_frame_queues[i].Push(std::move(buffer));
-
-            audio_buffers[i].erase(audio_buffers[i].begin(), audio_buffers[i].begin() + frame_size);
-        }
-    }
-}
-
 } // namespace VideoDumper
diff --git a/src/core/dumping/ffmpeg_backend.h b/src/core/dumping/ffmpeg_backend.h
index f08f31d3d..f0962189e 100644
--- a/src/core/dumping/ffmpeg_backend.h
+++ b/src/core/dumping/ffmpeg_backend.h
@@ -96,6 +96,7 @@ private:
 /**
  * A FFmpegStream used for audio data.
  * Resamples (converts), encodes and writes a frame.
+ * This also temporarily stores resampled audio data before there are enough to form a frame.
  */
 class FFmpegAudioStream : public FFmpegStream {
 public:
@@ -103,8 +104,8 @@ public:
     bool Init(AVFormatContext* format_context);
     void Free();
 
-    void ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1);
-    std::size_t GetAudioFrameSize() const;
+    void ProcessFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1);
+    void Flush();
 
 private:
     struct SwrContextDeleter {
@@ -113,12 +114,14 @@
         }
     };
 
-    u64 sample_count{};
+    u64 frame_size{};
+    u64 frame_count{};
 
     std::unique_ptr<AVFrame, FrameDeleter> audio_frame{};
     std::unique_ptr<SwrContext, SwrContextDeleter> swr_context{};
 
     u8** resampled_data{};
+    u64 offset{}; // Number of output samples that are currently in resampled_data.
 };
 
 /**
@@ -132,10 +135,9 @@ public:
     bool Init(const std::string& path, const Layout::FramebufferLayout& layout);
     void Free();
     void ProcessVideoFrame(VideoFrame& frame);
-    void ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1);
+    void ProcessAudioFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1);
     void FlushVideo();
     void FlushAudio();
-    std::size_t GetAudioFrameSize() const;
     void WriteTrailer();
 
 private:
@@ -153,8 +155,7 @@
 
 /**
  * FFmpeg video dumping backend.
- * This class implements a double buffer, and an audio queue to keep audio data
- * before enough data is received to form a frame.
+ * This class implements a double buffer.
  */
 class FFmpegBackend : public Backend {
 public:
@@ -169,7 +170,6 @@ public:
     Layout::FramebufferLayout GetLayout() const override;
 
 private:
-    void CheckAudioBuffer();
     void EndDumping();
 
     std::atomic_bool is_dumping = false; ///< Whether the backend is currently dumping
@@ -182,9 +182,6 @@
     Common::Event event1, event2;
     std::thread video_processing_thread;
 
-    /// An audio buffer used to temporarily hold audio data, before the size is big enough
-    /// to be sent to the encoder as a frame
-    std::array<VariableAudioFrame, 2> audio_buffers;
     std::array<Common::SPSCQueue<VariableAudioFrame>, 2> audio_frame_queues;
     std::thread audio_processing_thread;
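
Note (not part of the patch): the sample rate selection described in the commit
message reduces to scanning the encoder's zero-terminated supported_samplerates
list and falling back to its first entry. Below is a minimal standalone sketch,
assuming codec comes from avcodec_find_encoder_by_name() and native_rate is
AudioCore::native_sample_rate; the helper name PickSampleRate is hypothetical
and only for illustration.

extern "C" {
#include <libavcodec/avcodec.h>
}

// Returns a sample rate the encoder accepts, preferring native_rate.
// codec->supported_samplerates is a 0-terminated array; a null pointer
// means the encoder accepts any sample rate.
int PickSampleRate(const AVCodec* codec, int native_rate) {
    if (!codec->supported_samplerates) {
        return native_rate;
    }
    for (const int* rate = codec->supported_samplerates; *rate != 0; ++rate) {
        if (*rate == native_rate) {
            return native_rate; // the encoder supports the native rate, keep it
        }
    }
    // The native rate is not supported; fall back to the first listed rate.
    return codec->supported_samplerates[0];
}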
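
Note (not part of the patch): the buffering flow used by ProcessFrame and Flush,
where swresample holds the input and we drain at most one frame_size of output
at a time while tracking the leftover offset, can be illustrated outside of the
FFmpegStream machinery. This is only a sketch under stated assumptions: planar
s16 stereo input and output, a hypothetical ResampleBuffer/Push interface, and
an encode_frame callback standing in for SendFrame; error handling is reduced
to bools.

#include <array>
#include <cstddef>
#include <cstdint>

extern "C" {
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}

class ResampleBuffer {
public:
    bool Init(int in_rate, int out_rate, std::size_t frame_size_) {
        frame_size = frame_size_;
        swr = swr_alloc_set_opts(nullptr, AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_S16P, out_rate,
                                 AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_S16P, in_rate, 0, nullptr);
        if (!swr || swr_init(swr) < 0) {
            return false;
        }
        // Storage for exactly one output frame per channel.
        return av_samples_alloc_array_and_samples(&resampled, nullptr, 2,
                                                  static_cast<int>(frame_size),
                                                  AV_SAMPLE_FMT_S16P, 0) >= 0;
    }

    // Feeds one chunk of planar stereo samples. encode_frame(data, nb_samples)
    // is invoked for every complete frame of frame_size samples; any remainder
    // stays in resampled at position offset until the next call.
    template <typename EncodeFn>
    bool Push(const std::int16_t* ch0, const std::int16_t* ch1, std::size_t in_samples,
              EncodeFn&& encode_frame) {
        const int sample_size = av_get_bytes_per_sample(AV_SAMPLE_FMT_S16P);
        std::array<const std::uint8_t*, 2> src{reinterpret_cast<const std::uint8_t*>(ch0),
                                               reinterpret_cast<const std::uint8_t*>(ch1)};
        std::array<std::uint8_t*, 2> dst{resampled[0] + sample_size * offset,
                                         resampled[1] + sample_size * offset};

        // Request at most the samples still missing from the current frame;
        // swresample keeps any input it cannot convert yet in its own FIFO.
        int got = swr_convert(swr, dst.data(), static_cast<int>(frame_size - offset), src.data(),
                              static_cast<int>(in_samples));
        if (got < 0) {
            return false;
        }
        offset += static_cast<std::size_t>(got);

        while (offset == frame_size) {
            encode_frame(resampled, frame_size); // a full frame is ready
            // Drain more of the output buffered inside swresample, if any.
            got = swr_convert(swr, resampled, static_cast<int>(frame_size), nullptr, 0);
            if (got < 0) {
                return false;
            }
            offset = static_cast<std::size_t>(got);
        }
        return true;
    }

    // Samples of a partial frame still pending; on shutdown these would be
    // sent as one short final frame, as FFmpegAudioStream::Flush does.
    std::size_t PendingSamples() const {
        return offset;
    }

    ~ResampleBuffer() {
        if (resampled) {
            av_freep(&resampled[0]);
            av_freep(&resampled);
        }
        swr_free(&swr);
    }

private:
    SwrContext* swr = nullptr;
    std::uint8_t** resampled = nullptr;
    std::size_t frame_size = 0;
    std::size_t offset = 0; // resampled samples currently held, always < frame_size
};

In the patch, the in_rate/out_rate pair corresponds to AudioCore::native_sample_rate
and codec_context->sample_rate, and frame_size to the encoder's frame size (or 160
for variable frame size encoders).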