# invidious-experimenting/src/invidious/jobs.cr

# Continuously crawls videos: seeds a queue of IDs from random searches,
# fetches each video, and follows its related videos ("rvs") to discover more.
def crawl_videos(db, logger)
  ids = Deque(String).new
  random = Random.new

  search(random.base64(3)).as(Tuple)[1].each do |video|
    if video.is_a?(SearchVideo)
      ids << video.id
    end
  end

  loop do
    if ids.empty?
      search(random.base64(3)).as(Tuple)[1].each do |video|
        if video.is_a?(SearchVideo)
          ids << video.id
        end
      end
    end

    begin
      id = ids[0]
      video = get_video(id, db)
    rescue ex
      logger.write("#{id} : #{ex.message}\n")
      next
    ensure
      ids.delete(id)
    end

    rvs = [] of Hash(String, String)
    video.info["rvs"]?.try &.split(",").each do |rv|
      rvs << HTTP::Params.parse(rv).to_h
    end

    rvs.each do |rv|
      if rv.has_key?("id") && !db.query_one?("SELECT EXISTS (SELECT true FROM videos WHERE id = $1)", rv["id"], as: Bool)
        ids.delete(id)
        ids << rv["id"]

        # Cap the queue at 150 pending IDs
        if ids.size == 150
          ids.shift
        end
      end
    end

    Fiber.yield
  end
end
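
# Usage sketch (assumption, not part of this file): these jobs are infinite
# loops, so the caller is expected to launch each one in its own fiber from
# the main entry point, along the lines of
#
#   spawn { crawl_videos(PG_DB, logger) }
#
# where `PG_DB` and `logger` are placeholder names for the database handle
# and logger configured elsewhere.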

# Periodically refreshes every channel in the database, oldest first, using up
# to `max_threads` concurrent fibers. A Channel is used as a simple semaphore
# to bound the number of in-flight fetches.
def refresh_channels(db, logger, max_threads = 1, full_refresh = false)
  max_channel = Channel(Int32).new

  spawn do
    max_threads = max_channel.receive
    active_threads = 0
    active_channel = Channel(Bool).new

    loop do
      db.query("SELECT id FROM channels ORDER BY updated") do |rs|
        rs.each do
          id = rs.read(String)

          if active_threads >= max_threads
            if active_channel.receive
              active_threads -= 1
            end
          end

          active_threads += 1
          spawn do
            begin
              channel = fetch_channel(id, db, full_refresh)

              db.exec("UPDATE channels SET updated = $1, author = $2, deleted = false WHERE id = $3", Time.now, channel.author, id)
            rescue ex
              if ex.message == "Deleted or invalid channel"
                db.exec("UPDATE channels SET updated = $1, deleted = true WHERE id = $2", Time.now, id)
              end

              logger.write("#{id} : #{ex.message}\n")
            end

            active_channel.send(true)
          end
        end
      end
    end
  end

  max_channel.send(max_threads)
end
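
# Usage sketch (assumption): the caller chooses the degree of parallelism and
# whether to do a full refresh, e.g.
#
#   spawn { refresh_channels(PG_DB, logger, config.channel_threads, config.full_refresh) }
#
# `config.channel_threads` and `config.full_refresh` are hypothetical option
# names; the real configuration keys live outside this file.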

def refresh_videos(db, logger)
  loop do
    db.query("SELECT id FROM videos ORDER BY updated") do |rs|
      rs.each do
        begin
          id = rs.read(String)
          video = get_video(id, db)
        rescue ex
          logger.write("#{id} : #{ex.message}\n")
          next
        end
      end
    end

    Fiber.yield
  end
end

# Refreshes each user's subscription feed, which is stored as a materialized
# view named after a truncated SHA-256 of the user's email. Views with a stale
# schema are dropped and recreated; up to `max_threads` refreshes run at once.
def refresh_feeds(db, logger, max_threads = 1)
  max_channel = Channel(Int32).new
  spawn do
    max_threads = max_channel.receive
    active_threads = 0
    active_channel = Channel(Bool).new

    loop do
      db.query("SELECT email FROM users") do |rs|
        rs.each do
          email = rs.read(String)
          view_name = "subscriptions_#{sha256(email)[0..7]}"

          if active_threads >= max_threads
            if active_channel.receive
              active_threads -= 1
            end
          end

          active_threads += 1
          spawn do
            begin
              db.query("SELECT * FROM #{view_name} LIMIT 1") do |rs|
                # Drop the view if it doesn't have the same number of columns as ChannelVideo
                if ChannelVideo.from_rs(rs)[0]?.try &.to_a.size.try &.!= rs.column_count
                  db.exec("DROP MATERIALIZED VIEW #{view_name}")
                  raise "valid schema does not exist"
                end
              end

              db.exec("REFRESH MATERIALIZED VIEW #{view_name}")
            rescue ex
              # Create view if it doesn't exist
              if ex.message.try &.ends_with?("does not exist")
                # While iterating through, we may have an email stored from a deleted account
                if db.query_one?("SELECT true FROM users WHERE email = $1", email, as: Bool)
                  db.exec("CREATE MATERIALIZED VIEW #{view_name} AS \
                    SELECT * FROM channel_videos WHERE \
                    ucid = ANY ((SELECT subscriptions FROM users WHERE email = E'#{email.gsub("'", "\\'")}')::text[]) \
                    ORDER BY published DESC;")

                  logger.write("CREATE #{view_name}\n")
                end
              else
                logger.write("REFRESH #{email} : #{ex.message}\n")
              end
            end

            active_channel.send(true)
          end
        end
      end
    end
  end

  max_channel.send(max_threads)
end

# (Re)subscribes to push notifications for channels whose subscription is
# older than four days, when pubsub feeds are enabled in the config.
def subscribe_to_feeds(db, logger, key, config)
  if config.use_pubsub_feeds
    spawn do
      loop do
        db.query_all("SELECT id FROM channels WHERE CURRENT_TIMESTAMP - subscribed > '4 days'") do |rs|
          rs.each do
            ucid = rs.read(String)
            response = subscribe_pubsub(ucid, key, config)

            if response.status_code >= 400
              logger.write("#{ucid} : #{response.body}\n")
            end
          end
        end

        sleep 1.minute
        Fiber.yield
      end
    end
  end
end

def pull_top_videos(config, db)
  loop do
    begin
      top = rank_videos(db, 40)
    rescue ex
      next
    end

    if top.size > 0
      args = arg_array(top)
    else
      next
    end

    videos = [] of Video

    top.each do |id|
      begin
        videos << get_video(id, db)
      rescue ex
        next
      end
    end

    yield videos
    Fiber.yield
  end
end
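
# Usage sketch (assumption): the block receives each freshly ranked batch, so
# a caller can keep a shared list of top videos up to date, e.g.
#
#   spawn do
#     pull_top_videos(config, PG_DB) do |videos|
#       top_videos = videos
#     end
#   end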

# Yields the latest video from each of the 40 most-subscribed channels,
# sorted by publication date (newest first).
def pull_popular_videos(db)
  loop do
    subscriptions = db.query_all("SELECT channel FROM \
      (SELECT UNNEST(subscriptions) AS channel FROM users) AS d \
      GROUP BY channel ORDER BY COUNT(channel) DESC LIMIT 40", as: String)

    videos = db.query_all("SELECT DISTINCT ON (ucid) * FROM \
      channel_videos WHERE ucid IN (#{arg_array(subscriptions)}) \
      ORDER BY ucid, published DESC", subscriptions, as: ChannelVideo).sort_by { |video| video.published }.reverse

    yield videos
    Fiber.yield
  end
end

def update_decrypt_function
  loop do
    begin
      decrypt_function = fetch_decrypt_function
    rescue ex
      next
    end

    yield decrypt_function
  end
end
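
# Usage sketch (assumption): the caller stores the latest decrypt function for
# use when descrambling stream URLs, e.g.
#
#   spawn do
#     update_decrypt_function do |func|
#       decrypt_function = func
#     end
#   end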

def find_working_proxies(regions)
  loop do
    regions.each do |region|
      proxies = get_proxies(region).first(20)
      proxies = proxies.map { |proxy| {ip: proxy[:ip], port: proxy[:port]} }
      # proxies = filter_proxies(proxies)

      yield region, proxies
      Fiber.yield
    end
  end
end