Use new youtube API to fetch channel videos (#1355)

* Use new API to fetch videos from channels

This mirrors the process used by subscriptions.gir.st. The old API is
tried first, and if it fails then the new one is used.

* Use the new API whenever getting videos from a channel

I created the get_channel_videos_response function because now instead
of just getting a single url, there are extra steps involved in getting
the API response for channel videos, and these steps don't need to be
repeated throughout the code.

The only remaining exception is the bypass_captcha function, which still
only makes a request to the old API. I don't know whether this code
needs to be updated to use the new API for captcha bypassing to work
correctly.

* Correctly determine video length with new api

* Remove unnecessary line
This commit is contained in:
Ben Heller 2020-09-02 13:28:57 -07:00 committed by GitHub
parent 13f58d602f
commit 4a6e920d0e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 204 additions and 156 deletions

View File

@ -213,8 +213,7 @@ def fetch_channel(ucid, db, pull_all_videos = true, locale = nil)
page = 1 page = 1
url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated) response = get_channel_videos_response(ucid, page, auto_generated: auto_generated)
response = YT_POOL.client &.get(url)
videos = [] of SearchVideo videos = [] of SearchVideo
begin begin
@ -291,8 +290,7 @@ def fetch_channel(ucid, db, pull_all_videos = true, locale = nil)
ids = [] of String ids = [] of String
loop do loop do
url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated) response = get_channel_videos_response(ucid, page, auto_generated: auto_generated)
response = YT_POOL.client &.get(url)
initial_data = JSON.parse(response.body).as_a.find &.["response"]? initial_data = JSON.parse(response.body).as_a.find &.["response"]?
raise "Could not extract JSON" if !initial_data raise "Could not extract JSON" if !initial_data
videos = extract_videos(initial_data.as_h, author, ucid) videos = extract_videos(initial_data.as_h, author, ucid)
@ -396,7 +394,7 @@ def fetch_channel_playlists(ucid, author, auto_generated, continuation, sort_by)
return items, continuation return items, continuation
end end
def produce_channel_videos_url(ucid, page = 1, auto_generated = nil, sort_by = "newest") def produce_channel_videos_url(ucid, page = 1, auto_generated = nil, sort_by = "newest", v2 = false)
object = { object = {
"80226972:embedded" => { "80226972:embedded" => {
"2:string" => ucid, "2:string" => ucid,
@ -411,18 +409,33 @@ def produce_channel_videos_url(ucid, page = 1, auto_generated = nil, sort_by = "
}, },
} }
if auto_generated if !v2
seed = Time.unix(1525757349) if auto_generated
until seed >= Time.utc seed = Time.unix(1525757349)
seed += 1.month until seed >= Time.utc
end seed += 1.month
timestamp = seed - (page - 1).months end
timestamp = seed - (page - 1).months
object["80226972:embedded"]["3:base64"].as(Hash)["4:varint"] = 0x36_i64 object["80226972:embedded"]["3:base64"].as(Hash)["4:varint"] = 0x36_i64
object["80226972:embedded"]["3:base64"].as(Hash)["15:string"] = "#{timestamp.to_unix}" object["80226972:embedded"]["3:base64"].as(Hash)["15:string"] = "#{timestamp.to_unix}"
else
object["80226972:embedded"]["3:base64"].as(Hash)["4:varint"] = 0_i64
object["80226972:embedded"]["3:base64"].as(Hash)["15:string"] = "#{page}"
end
else else
object["80226972:embedded"]["3:base64"].as(Hash)["4:varint"] = 0_i64 object["80226972:embedded"]["3:base64"].as(Hash)["4:varint"] = 0_i64
object["80226972:embedded"]["3:base64"].as(Hash)["15:string"] = "#{page}"
object["80226972:embedded"]["3:base64"].as(Hash)["61:string"] = Base64.urlsafe_encode(Protodec::Any.from_json(Protodec::Any.cast_json({
"1:embedded" => {
"1:varint" => 6307666885028338688_i64,
"2:embedded" => {
"1:string" => Base64.urlsafe_encode(Protodec::Any.from_json(Protodec::Any.cast_json({
"1:varint" => 30_i64 * (page - 1),
}))),
},
},
})))
end end
case sort_by case sort_by
@ -901,12 +914,28 @@ def get_about_info(ucid, locale)
}) })
end end
def get_channel_videos_response(ucid, page = 1, auto_generated = nil, sort_by = "newest")
url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated, sort_by: sort_by, v2: false)
response = YT_POOL.client &.get(url)
initial_data = JSON.parse(response.body).as_a.find &.["response"]?
return response if !initial_data
needs_v2 = initial_data
.try &.["response"]?.try &.["alerts"]?
.try &.as_a.any? { |alert|
alert.try &.["alertRenderer"]?.try &.["type"]?.try { |t| t == "ERROR" }
}
if needs_v2
url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated, sort_by: sort_by, v2: true)
response = YT_POOL.client &.get(url)
end
response
end
def get_60_videos(ucid, author, page, auto_generated, sort_by = "newest") def get_60_videos(ucid, author, page, auto_generated, sort_by = "newest")
videos = [] of SearchVideo videos = [] of SearchVideo
2.times do |i| 2.times do |i|
url = produce_channel_videos_url(ucid, page * 2 + (i - 1), auto_generated: auto_generated, sort_by: sort_by) response = get_channel_videos_response(ucid, page * 2 + (i - 1), auto_generated: auto_generated, sort_by: sort_by)
response = YT_POOL.client &.get(url)
initial_data = JSON.parse(response.body).as_a.find &.["response"]? initial_data = JSON.parse(response.body).as_a.find &.["response"]?
break if !initial_data break if !initial_data
videos.concat extract_videos(initial_data.as_h, author, ucid) videos.concat extract_videos(initial_data.as_h, author, ucid)
@ -916,8 +945,7 @@ def get_60_videos(ucid, author, page, auto_generated, sort_by = "newest")
end end
def get_latest_videos(ucid) def get_latest_videos(ucid)
url = produce_channel_videos_url(ucid, 0) response = get_channel_videos_response(ucid, 1)
response = YT_POOL.client &.get(url)
initial_data = JSON.parse(response.body).as_a.find &.["response"]? initial_data = JSON.parse(response.body).as_a.find &.["response"]?
return [] of SearchVideo if !initial_data return [] of SearchVideo if !initial_data
author = initial_data["response"]?.try &.["metadata"]?.try &.["channelMetadataRenderer"]?.try &.["title"]?.try &.as_s author = initial_data["response"]?.try &.["metadata"]?.try &.["channelMetadataRenderer"]?.try &.["title"]?.try &.as_s

View File

@ -164,148 +164,168 @@ def extract_videos(initial_data : Hash(String, JSON::Any), author_fallback : Str
extract_items(initial_data, author_fallback, author_id_fallback).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo)) extract_items(initial_data, author_fallback, author_id_fallback).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo))
end end
def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fallback : String? = nil)
if i = (item["videoRenderer"]? || item["gridVideoRenderer"]?)
video_id = i["videoId"].as_s
title = i["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || ""
author_info = i["ownerText"]?.try &.["runs"].as_a[0]?
author = author_info.try &.["text"].as_s || author_fallback || ""
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || ""
published = i["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local
view_count = i["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64
description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || ""
length_seconds = i["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } ||
i["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]?
.try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0
live_now = false
paid = false
premium = false
premiere_timestamp = i["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) }
i["badges"]?.try &.as_a.each do |badge|
b = badge["metadataBadgeRenderer"]
case b["label"].as_s
when "LIVE NOW"
live_now = true
when "New", "4K", "CC"
# TODO
when "Premium"
paid = true
# TODO: Potentially available as i["topStandaloneBadge"]["metadataBadgeRenderer"]
premium = true
else nil # Ignore
end
end
SearchVideo.new({
title: title,
id: video_id,
author: author,
ucid: author_id,
published: published,
views: view_count,
description_html: description_html,
length_seconds: length_seconds,
live_now: live_now,
paid: paid,
premium: premium,
premiere_timestamp: premiere_timestamp,
})
elsif i = item["channelRenderer"]?
author = i["title"]["simpleText"]?.try &.as_s || author_fallback || ""
author_id = i["channelId"]?.try &.as_s || author_id_fallback || ""
author_thumbnail = i["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try { |u| "https:#{u["url"]}" } || ""
subscriber_count = i["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0
auto_generated = false
auto_generated = true if !i["videoCountText"]?
video_count = i["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0
description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || ""
SearchChannel.new({
author: author,
ucid: author_id,
author_thumbnail: author_thumbnail,
subscriber_count: subscriber_count,
video_count: video_count,
description_html: description_html,
auto_generated: auto_generated,
})
elsif i = item["gridPlaylistRenderer"]?
title = i["title"]["runs"].as_a[0]?.try &.["text"].as_s || ""
plid = i["playlistId"]?.try &.as_s || ""
video_count = i["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0
playlist_thumbnail = i["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || ""
SearchPlaylist.new({
title: title,
id: plid,
author: author_fallback || "",
ucid: author_id_fallback || "",
video_count: video_count,
videos: [] of SearchPlaylistVideo,
thumbnail: playlist_thumbnail,
})
elsif i = item["playlistRenderer"]?
title = i["title"]["simpleText"]?.try &.as_s || ""
plid = i["playlistId"]?.try &.as_s || ""
video_count = i["videoCount"]?.try &.as_s.to_i || 0
playlist_thumbnail = i["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || ""
author_info = i["shortBylineText"]?.try &.["runs"].as_a[0]?
author = author_info.try &.["text"].as_s || author_fallback || ""
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || ""
videos = i["videos"]?.try &.as_a.map do |v|
v = v["childVideoRenderer"]
v_title = v["title"]["simpleText"]?.try &.as_s || ""
v_id = v["videoId"]?.try &.as_s || ""
v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0
SearchPlaylistVideo.new({
title: v_title,
id: v_id,
length_seconds: v_length_seconds,
})
end || [] of SearchPlaylistVideo
# TODO: i["publishedTimeText"]?
SearchPlaylist.new({
title: title,
id: plid,
author: author,
ucid: author_id,
video_count: video_count,
videos: videos,
thumbnail: playlist_thumbnail,
})
elsif i = item["radioRenderer"]? # Mix
# TODO
elsif i = item["showRenderer"]? # Show
# TODO
elsif i = item["shelfRenderer"]?
elsif i = item["horizontalCardListRenderer"]?
elsif i = item["searchPyvRenderer"]? # Ad
end
end
def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil)
items = [] of SearchItem items = [] of SearchItem
initial_data.try { |t| t["contents"]? || t["response"]? } channel_v2_response = initial_data
.try { |t| t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]?.try &.["tabRenderer"]["content"] || .try &.["response"]?
t["twoColumnSearchResultsRenderer"]?.try &.["primaryContents"] || .try &.["continuationContents"]?
t["continuationContents"]? } .try &.["gridContinuation"]?
.try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? } .try &.["items"]?
.try &.["contents"].as_a
.each { |c| c.try &.["itemSectionRenderer"]?.try &.["contents"].as_a
.try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a ||
t[0]?.try &.["gridRenderer"]?.try &.["items"].as_a || t }
.each { |item|
if i = item["videoRenderer"]?
video_id = i["videoId"].as_s
title = i["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || ""
author_info = i["ownerText"]?.try &.["runs"].as_a[0]? if channel_v2_response
author = author_info.try &.["text"].as_s || author_fallback || "" channel_v2_response.try &.as_a.each { |item|
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || "" extract_item(item, author_fallback, author_id_fallback)
.try { |t| items << t }
published = i["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local }
view_count = i["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 else
description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || "" initial_data.try { |t| t["contents"]? || t["response"]? }
length_seconds = i["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 .try { |t| t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]?.try &.["tabRenderer"]["content"] ||
t["twoColumnSearchResultsRenderer"]?.try &.["primaryContents"] ||
live_now = false t["continuationContents"]? }
paid = false .try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? }
premium = false .try &.["contents"].as_a
.each { |c| c.try &.["itemSectionRenderer"]?.try &.["contents"].as_a
premiere_timestamp = i["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } .try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a ||
t[0]?.try &.["gridRenderer"]?.try &.["items"].as_a || t }
i["badges"]?.try &.as_a.each do |badge| .each { |item|
b = badge["metadataBadgeRenderer"] extract_item(item, author_fallback, author_id_fallback)
case b["label"].as_s .try { |t| items << t }
when "LIVE NOW" } }
live_now = true end
when "New", "4K", "CC"
# TODO
when "Premium"
paid = true
# TODO: Potentially available as i["topStandaloneBadge"]["metadataBadgeRenderer"]
premium = true
else nil # Ignore
end
end
items << SearchVideo.new({
title: title,
id: video_id,
author: author,
ucid: author_id,
published: published,
views: view_count,
description_html: description_html,
length_seconds: length_seconds,
live_now: live_now,
paid: paid,
premium: premium,
premiere_timestamp: premiere_timestamp,
})
elsif i = item["channelRenderer"]?
author = i["title"]["simpleText"]?.try &.as_s || author_fallback || ""
author_id = i["channelId"]?.try &.as_s || author_id_fallback || ""
author_thumbnail = i["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try { |u| "https:#{u["url"]}" } || ""
subscriber_count = i["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0
auto_generated = false
auto_generated = true if !i["videoCountText"]?
video_count = i["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0
description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || ""
items << SearchChannel.new({
author: author,
ucid: author_id,
author_thumbnail: author_thumbnail,
subscriber_count: subscriber_count,
video_count: video_count,
description_html: description_html,
auto_generated: auto_generated,
})
elsif i = item["gridPlaylistRenderer"]?
title = i["title"]["runs"].as_a[0]?.try &.["text"].as_s || ""
plid = i["playlistId"]?.try &.as_s || ""
video_count = i["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0
playlist_thumbnail = i["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || ""
items << SearchPlaylist.new({
title: title,
id: plid,
author: author_fallback || "",
ucid: author_id_fallback || "",
video_count: video_count,
videos: [] of SearchPlaylistVideo,
thumbnail: playlist_thumbnail,
})
elsif i = item["playlistRenderer"]?
title = i["title"]["simpleText"]?.try &.as_s || ""
plid = i["playlistId"]?.try &.as_s || ""
video_count = i["videoCount"]?.try &.as_s.to_i || 0
playlist_thumbnail = i["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || ""
author_info = i["shortBylineText"]?.try &.["runs"].as_a[0]?
author = author_info.try &.["text"].as_s || author_fallback || ""
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || ""
videos = i["videos"]?.try &.as_a.map do |v|
v = v["childVideoRenderer"]
v_title = v["title"]["simpleText"]?.try &.as_s || ""
v_id = v["videoId"]?.try &.as_s || ""
v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0
SearchPlaylistVideo.new({
title: v_title,
id: v_id,
length_seconds: v_length_seconds,
})
end || [] of SearchPlaylistVideo
# TODO: i["publishedTimeText"]?
items << SearchPlaylist.new({
title: title,
id: plid,
author: author,
ucid: author_id,
video_count: video_count,
videos: videos,
thumbnail: playlist_thumbnail,
})
elsif i = item["radioRenderer"]? # Mix
# TODO
elsif i = item["showRenderer"]? # Show
# TODO
elsif i = item["shelfRenderer"]?
elsif i = item["horizontalCardListRenderer"]?
elsif i = item["searchPyvRenderer"]? # Ad
end
} }
items items
end end