Rewrite transcript logic to be more generic (#4747)

The transcript logic in Invidious was written specifically as a workaround for captions, not to support transcripts as a feature. This PR generalizes the logic so that it can be used to implement transcripts within Invidious. The most notable change is the added parsing of section headings, which were previously skipped over in favor of regular lines.
2024-12-31 12:20:17 +05:30 · 2024-07-10 22:14:56 +02:00 · 2024-07-10 22:14:56 +02:00 · a56a724a55
commit a56a724a55
parent 0a54e26536 f466116cd7
2 changed files with 90 additions and 36 deletions
--- a/src/invidious/routes/api/v1/videos.cr
+++ b/src/invidious/routes/api/v1/videos.cr
@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos
    if CONFIG.use_innertube_for_captions
      params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
      initial_data = YoutubeAPI.get_transcript(params)
-      webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
+      transcript = Invidious::Videos::Transcript.from_raw(
        YoutubeAPI.get_transcript(params),
        caption.language_code,
        caption.auto_generated
      )
      webvtt = transcript.to_vtt
    else
      # Timedtext API handling
      url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
--- a/src/invidious/videos/transcript.cr
+++ b/src/invidious/videos/transcript.cr
@ -1,8 +1,26 @@
 module Invidious::Videos
-  # Namespace for methods primarily relating to Transcripts
+  # A `Transcript` struct encapsulates a sequence of lines that together form the whole transcript for a given YouTube video.
-  module Transcript
+  # These lines can be categorized into two types: section headings and regular lines representing content from the video.
-    record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+  struct Transcript
    # Types
    #
    # A section heading within the transcript.
    # Note that despite the `_ms` suffix, `start_ms` and `end_ms` are `Time::Span` values, not raw integers.
    record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
    # A regular line representing content from the video. Same fields as `HeadingLine`.
    record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
    # Any entry that can appear in a transcript: either a heading or a regular line.
    alias TranscriptLine = HeadingLine | RegularLine
    # The entries of the transcript, headings and regular lines alike.
    property lines : Array(TranscriptLine)
    # Language code of the transcript; `to_vtt` writes it as the WebVTT "Language" setting.
    property language_code : String
    # NOTE(review): presumably whether the transcript was auto-generated by YouTube — confirm against callers.
    property auto_generated : Bool
    # User friendly label for the current transcript.
    # Example: "English (auto-generated)"
    property label : String
    # Initializes a new Transcript struct with the contents and associated metadata describing it
    #
    # Each argument is stored directly in the instance variable of the same name
    # (`@lines`, `@language_code`, `@auto_generated`, `@label`); no other work is done.
    def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool, @label : String)
    end
    # Generates a protobuf string to fetch the requested transcript from YouTube
    def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
      kind = auto_generated ? "asr" : ""
@ -30,48 +48,79 @@ module Invidious::Videos
      return params
    end
-    def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
+    # Constructs a `Transcript` struct from the initial YouTube response
-      # Convert into array of TranscriptLine
+    def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
-      lines = self.parse(initial_data)
+      transcript_panel = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
        "content", "transcriptSearchPanelRenderer")
      segment_list = transcript_panel.dig("body", "transcriptSegmentListRenderer")
      # Without "initialSegments" there is nothing to build a transcript from,
      # so this is reported to the caller as a NotFoundException.
      if !segment_list["initialSegments"]?
        raise NotFoundException.new("Requested transcript does not exist")
      end
      # Extract user-friendly label for the current transcript
      footer_language_menu = transcript_panel.dig?(
        "footer", "transcriptFooterRenderer", "languageMenu", "sortFilterSubMenuRenderer", "subMenuItems"
      )
      # Default to the language code. It is only replaced when the language menu
      # is present, one of its items is flagged as selected, and that item has a
      # string title. Indexing the selection directly would raise when no item
      # is selected or when an item lacks the "selected"/"title" keys.
      label = language_code
      if footer_language_menu
        selected_item = footer_language_menu.as_a.find do |item|
          item["selected"]?.try(&.as_bool?)
        end
        if title = selected_item.try(&.["title"]?).try(&.as_s?)
          label = title
        end
      end
      # Extract transcript lines
      initial_segments = segment_list["initialSegments"].as_a
      lines = [] of TranscriptLine
      initial_segments.each do |line|
        # An entry is either a section header or a regular segment; both carry
        # the same "startMs" / "endMs" / "snippet" fields read below.
        if unpacked_line = line["transcriptSectionHeaderRenderer"]?
          line_type = HeadingLine
        else
          unpacked_line = line["transcriptSegmentRenderer"]
          line_type = RegularLine
        end
        # Timestamps arrive as strings holding integer milliseconds
        start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
        end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
        # A snippet with no extractable text becomes an empty line
        text = extract_text(unpacked_line["snippet"]) || ""
        lines << line_type.new(start_ms, end_ms, text)
      end
      # The language code and auto-generated flag are passed through unchanged
      return Transcript.new(
        lines: lines,
        language_code: language_code,
        auto_generated: auto_generated,
        label: label
      )
    end
    # Converts transcript lines to a WebVTT file
    #
    # This is used within Invidious to replace subtitles,
    # working around YouTube's rate-limited timedtext endpoint.
    #
    # Returns the result of `WebVTT.build`. `HeadingLine` entries are left out of the output.
    def to_vtt
      # Settings passed to `WebVTT.build`
      settings_field = {
        "Kind"     => "captions",
-        "Language" => target_language,
+        "Language" => @language_code,
      }
      # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
      vtt = WebVTT.build(settings_field) do |vtt|
-        lines.each do |line|
+        @lines.each do |line|
          # Section headers are excluded from the VTT conversion so as to
          # match the regular captions returned from YouTube as much as possible
          next if line.is_a? HeadingLine
          # One cue per remaining line, using that line's own start and end times
          vtt.cue(line.start_ms, line.end_ms, line.line)
        end
      end
      return vtt
    end
    # Extracts the caption lines from the raw transcript response.
    #
    # Walks the "initialSegments" array of the response and builds one
    # `TranscriptLine` per `transcriptSegmentRenderer` entry.
    private def self.parse(initial_data : Hash(String, JSON::Any))
      segments = initial_data.dig(
        "actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
        "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
        "initialSegments"
      ).as_a
      parsed = [] of TranscriptLine
      segments.each do |entry|
        # Transcript section headers are not a part of the captions, so they can safely be skipped.
        next if entry.as_h.has_key?("transcriptSectionHeaderRenderer")
        segment = entry["transcriptSegmentRenderer"]
        # Timestamps arrive as strings holding integer milliseconds;
        # a snippet with no extractable text becomes an empty line.
        parsed << TranscriptLine.new(
          segment["startMs"].as_s.to_i.millisecond,
          segment["endMs"].as_s.to_i.millisecond,
          extract_text(segment["snippet"]) || ""
        )
      end
      parsed
    end
  end
 end