2019-01-24 23:49:02 +05:30
|
|
|
# Crawls videos forever, walking from each fetched video to its related videos.
#
# A queue of video ids is kept in memory. Whenever the queue is empty it is
# refilled from a search for a random base64 string. The id at the front of
# the queue is fetched with `get_video`, and every related-video id ("rvs")
# that is not yet present in the `videos` table is appended to the queue.
#
# db     - database handle, passed to `get_video` and used for the existence check
# logger - receives one line per failed fetch via `logger.write`
#
# Never returns.
def crawl_videos(db, logger)
  ids = Deque(String).new
  random = Random.new

  loop do
    # Seed (or re-seed) the queue. On the first iteration the queue is always
    # empty, so no separate seeding step is needed before the loop.
    if ids.empty?
      search(random.base64(3)).as(Tuple)[1].each do |result|
        ids << result.id if result.is_a?(SearchVideo)
      end
    end

    begin
      # Raises if the search above produced nothing; the rescue below logs it
      # and the next iteration searches again.
      id = ids[0]
      video = get_video(id, db)
    rescue ex
      logger.write("#{id} : #{ex.message}\n")
      next
    ensure
      # Removes the id from the queue whether or not the fetch succeeded.
      ids.delete(id)
    end

    # "rvs" is a comma-separated list of URL-encoded parameter sets.
    rvs = [] of Hash(String, String)
    if related = video.info["rvs"]?
      related.split(",").each do |rv|
        rvs << HTTP::Params.parse(rv).to_h
      end
    end

    rvs.each do |rv|
      next unless rv.has_key?("id")
      next if db.query_one?("SELECT EXISTS (SELECT true FROM videos WHERE id = $1)", rv["id"], as: Bool)

      # `id` was already removed by the `ensure` above, so it is not deleted
      # again here.
      ids << rv["id"]

      # Keep the queue bounded: drop the oldest entry once it reaches 150.
      ids.shift if ids.size == 150
    end

    Fiber.yield
  end
end
|
|
|
|
|
2019-01-24 23:49:02 +05:30
|
|
|
# Starts a background fiber that refreshes every channel forever, oldest
# `updated` first, running at most `max_threads` refreshes concurrently.
#
# db           - database handle used to list channels and record results
# logger       - receives one line per failed refresh via `logger.write`
# max_threads  - maximum number of refresh fibers in flight (default 1)
# full_refresh - forwarded unchanged to `fetch_channel`
#
# The limit is handed to the background fiber through `max_channel`.
def refresh_channels(db, logger, max_threads = 1, full_refresh = false)
  max_channel = Channel(Int32).new

  spawn do
    max_threads = max_channel.receive
    active_threads = 0
    active_channel = Channel(Bool).new

    loop do
      db.query("SELECT id FROM channels ORDER BY updated") do |rs|
        rs.each do
          id = rs.read(String)

          # At the limit: wait for one worker to report completion before
          # starting another.
          if active_threads >= max_threads && active_channel.receive
            active_threads -= 1
          end

          active_threads += 1
          spawn do
            begin
              client = make_client(YT_URL)
              channel = fetch_channel(id, client, db, full_refresh)

              db.exec("UPDATE channels SET updated = $1, author = $2, deleted = false WHERE id = $3", Time.now, channel.author, id)
            rescue ex
              if ex.message == "Deleted or invalid channel"
                db.exec("UPDATE channels SET deleted = true WHERE id = $1", id)
              end

              logger.write("#{id} : #{ex.message}\n")
            ensure
              # Always release the slot, even if the rescue handler itself
              # raises; otherwise the scheduling loop above could wait on
              # `receive` forever.
              active_channel.send(true)
            end
          end
        end
      end
    end
  end

  max_channel.send(max_threads)
end
|
|
|
|
|
2019-01-24 23:49:02 +05:30
|
|
|
# Walks the `videos` table forever, oldest `updated` first, calling
# `get_video` for every id. Failures are written to `logger` and skipped.
#
# db     - database handle used for the listing and passed to `get_video`
# logger - receives one line per failed video via `logger.write`
#
# Never returns; yields to other fibers after each full pass.
def refresh_videos(db, logger)
  loop do
    db.query("SELECT id FROM videos ORDER BY updated") do |rows|
      rows.each do
        begin
          video_id = rows.read(String)
          # Only the call matters here; its return value is not used.
          get_video(video_id, db)
        rescue error
          logger.write("#{video_id} : #{error.message}\n")
        end
      end
    end

    Fiber.yield
  end
end
|
|
|
|
|
2019-01-24 23:49:02 +05:30
|
|
|
# Starts a background fiber that refreshes every user's subscription
# materialized view forever, running at most `max_threads` refreshes
# concurrently.
#
# db          - database handle used to list users and refresh the views
# logger      - receives one line per failed refresh via `logger.write`
# max_threads - maximum number of refresh fibers in flight (default 1)
#
# The limit is handed to the background fiber through `max_channel`.
def refresh_feeds(db, logger, max_threads = 1)
  max_channel = Channel(Int32).new

  spawn do
    max_threads = max_channel.receive
    active_threads = 0
    active_channel = Channel(Bool).new

    loop do
      db.query("SELECT email FROM users") do |rs|
        rs.each do
          email = rs.read(String)
          # View name is derived from the first 8 characters of sha256(email).
          view_name = "subscriptions_#{sha256(email)[0..7]}"

          # At the limit: wait for one worker to report completion before
          # starting another.
          if active_threads >= max_threads && active_channel.receive
            active_threads -= 1
          end

          active_threads += 1
          spawn do
            begin
              db.exec("REFRESH MATERIALIZED VIEW #{view_name}")
            rescue ex
              logger.write("REFRESH #{email} : #{ex.message}\n")
            ensure
              # Always release the slot, even if logging itself raises;
              # otherwise the scheduling loop above could wait on `receive`
              # forever.
              active_channel.send(true)
            end
          end
        end
      end
    end
  end

  max_channel.send(max_threads)
end
|
|
|
|
|
2018-08-05 02:00:44 +05:30
|
|
|
# Repeatedly ranks the top videos, loads each one with `get_video`, and
# yields the resulting array of `Video` to the caller's block.
#
# config - read for `dl_api_key`; when set, DetectLanguage is configured with
#          it and `true` is passed to `rank_videos` as its filter argument
# db     - database handle passed to `rank_videos` and `get_video`
#
# Never returns. Rounds where ranking fails or returns nothing are skipped;
# individual videos that fail to load are left out of the yielded array.
def pull_top_videos(config, db)
  filter = false

  if config.dl_api_key
    DetectLanguage.configure do |settings|
      settings.api_key = config.dl_api_key.not_nil!
    end

    filter = true
  end

  loop do
    ranked = begin
      rank_videos(db, 40, filter, YT_URL)
    rescue
      next
    end

    next unless ranked.size > 0

    # NOTE(review): the result of this call is not used anywhere in this
    # method; the call is kept as in the original.
    arg_array(ranked)

    fetched = [] of Video

    ranked.each do |video_id|
      begin
        fetched << get_video(video_id, db)
      rescue
        next
      end
    end

    yield fetched
    Fiber.yield
  end
end
|
|
|
|
|
2018-11-09 07:38:03 +05:30
|
|
|
# Repeatedly builds the "popular" list and yields it to the caller's block.
#
# Each round selects the 40 channels that appear most often in users'
# subscriptions, then takes the most recently published row per channel from
# `channel_videos`, and yields those videos sorted newest first.
#
# db - database handle used for both queries. The `db` argument is now used
#      for the queries instead of the global `PG_DB`, consistent with the
#      other functions in this file.
#
# Never returns; yields to other fibers after each round.
def pull_popular_videos(db)
  loop do
    subscriptions = db.query_all("SELECT channel FROM \
      (SELECT UNNEST(subscriptions) AS channel FROM users) AS d \
      GROUP BY channel ORDER BY COUNT(channel) DESC LIMIT 40", as: String)

    videos = db.query_all("SELECT DISTINCT ON (ucid) * FROM \
      channel_videos WHERE ucid IN (#{arg_array(subscriptions)}) \
      ORDER BY ucid, published DESC", subscriptions, as: ChannelVideo).sort_by { |video| video.published }.reverse

    yield videos
    Fiber.yield
  end
end
|
|
|
|
|
2018-08-05 02:00:44 +05:30
|
|
|
# Repeatedly calls `fetch_decrypt_function` and yields each successful result
# to the caller's block. A failed fetch is retried on the next iteration
# without yielding anything.
#
# Never returns.
def update_decrypt_function
  loop do
    latest = begin
      fetch_decrypt_function
    rescue
      next
    end

    yield latest
  end
end
|
2018-09-26 04:26:59 +05:30
|
|
|
|
|
|
|
# Cycles through `regions` forever. For each region it takes the first 20
# entries returned by `get_proxies`, reduces each to an `{ip:, port:}` named
# tuple, and yields the region together with that list.
#
# regions - collection of regions to iterate over
#
# Never returns; yields to other fibers after each region.
def find_working_proxies(regions)
  loop do
    regions.each do |region|
      candidates = get_proxies(region).first(20).map { |entry| {ip: entry[:ip], port: entry[:port]} }
      # Filtering is currently disabled:
      # candidates = filter_proxies(candidates)

      yield region, candidates
      Fiber.yield
    end
  end
end
|