invidious-experimenting/src/helpers.cr

# A video record as read from / written to the `videos` table.
#
# The property order used by `initialize` and `to_a` is:
# id, info, updated, title, views, likes, dislikes, wilson_score,
# published, description.
class Video
  # Column converter: reads a String column and parses it as URL-encoded params.
  module HTTPParamConverter
    def self.from_rs(rs)
      raw = rs.read(String)
      HTTP::Params.parse(raw)
    end
  end

  # Column converter: reads a String column and parses it as an HTML document.
  # It is not referenced by the mapping in this class.
  module XMLConverter
    def self.from_rs(rs)
      raw = rs.read(String)
      XML.parse_html(raw)
    end
  end

  # Declares the properties (and their types) filled in when a row is read
  # with `as: Video`. `info` goes through `HTTPParamConverter`.
  DB.mapping({
    id:   String,
    info: {
      type:      HTTP::Params,
      default:   HTTP::Params.parse(""),
      converter: Video::HTTPParamConverter,
    },
    updated:      Time,
    title:        String,
    views:        Int64,
    likes:        Int32,
    dislikes:     Int32,
    wilson_score: Float64,
    published:    Time,
    description:  String,
  })

  # Builds a video directly from already-fetched values; every argument is
  # stored in the instance variable of the same name.
  def initialize(@id, @info, @updated, @title, @views, @likes, @dislikes, @wilson_score, @published, @description)
  end

  # All fields as an array, in the same order as the constructor arguments.
  def to_a
    [@id, @info, @updated, @title, @views, @likes, @dislikes, @wilson_score, @published, @description]
  end
end

# Top-level object of the JSON body that `get_reddit_comments` receives from
# `/submit.json`. Only the `data` member is mapped.
class RedditSubmit
  JSON.mapping({
    data: RedditSubmitData,
  })
end

# The `data` member of a `RedditSubmit`: a `children` array whose entries are
# each parsed as a `RedditThread`.
class RedditSubmitData
  JSON.mapping({
    children: Array(RedditThread),
  })
end

# One entry of `RedditSubmitData#children`; wraps the thread's fields in a
# `data` member.
class RedditThread
  JSON.mapping({
    data: RedditThreadData,
  })
end

# The fields of a single thread that are parsed from the JSON.
#
# Within this file, `score` is used to pick the top thread, and `subreddit`
# and `id` are used to build the URL of the thread's comments.
class RedditThreadData
  JSON.mapping({
    subreddit:    String,
    id:           String,
    num_comments: Int32,
    score:        Int32,
    author:       String,
    permalink:    String,
    title:        String,
  })
end

# Lower bound of the Wilson score confidence interval.
# See http://www.evanmiller.org/how-not-to-sort-by-average-rating.html
#
# `pos` is the number of positive ratings and `n` the total number of ratings.
# Returns 0.0 when `n` is zero.
def ci_lower_bound(pos, n)
  return 0.0 if n == 0

  # z = 1.96 corresponds to a confidence level of 0.95
  z = 1.96
  z_squared = z*z
  phat = 1.0*pos/n

  centre = phat + z_squared/(2*n)
  spread = z * Math.sqrt((phat*(1 - phat) + z_squared/(4*n))/n)

  (centre - spread)/(1 + z_squared/n)
end

# Formats an elapsed time span for display.
#
# Spans of at least one millisecond are shown in milliseconds ("12.34ms");
# anything shorter is shown in microseconds ("250.0µs"). Values are rounded
# to two decimal places.
def elapsed_text(elapsed)
  ms = elapsed.total_milliseconds

  if ms >= 1
    "#{ms.round(2)}ms"
  else
    "#{(ms * 1000).round(2)}µs"
  end
end

# Takes the first client out of `pool` and returns it.
#
# While the pool is empty this keeps polling, sleeping a random 0-10 ms
# between checks, so it only returns once a client is available.
def get_client(pool)
  loop do
    break unless pool.empty?
    sleep rand(0..10).milliseconds
  end

  pool.shift
end

# Fetches a video's metadata through `client` and returns it as a `Video`.
#
# Two requests are always made: `/get_video_info` (URL-encoded params) and the
# `/watch` page (HTML). If the params carry a "reason", the info request is
# retried once without `el=detailpage`; if a reason is still present it is
# raised as the error message.
#
# Title and view count come from the params; likes, dislikes, description and
# publication date are scraped from the watch page. Raises when the
# publication date is missing or is a relative time other than minutes/hours.
def fetch_video(id, client)
  info_body = client.get("/get_video_info?video_id=#{id}&el=detailpage&ps=default&eurl=&gl=US&hl=en").body
  watch_body = client.get("/watch?v=#{id}").body

  page = XML.parse_html(watch_body)
  info = HTTP::Params.parse(info_body)

  if info["reason"]?
    # Second attempt, this time without the `el=detailpage` parameter.
    retry_body = client.get("/get_video_info?video_id=#{id}&ps=default&eurl=&gl=US&hl=en").body
    info = HTTP::Params.parse(retry_body)
    raise info["reason"] if info["reason"]?
  end

  title = info["title"]
  views = info["view_count"].to_i64

  # Counts are rendered with thousands separators ("1,234"); a missing button
  # counts as zero.
  like_node = page.xpath_node(%q(//button[@title="I like this"]/span))
  likes = like_node.try { |node| node.content.delete(",").to_i } || 0

  dislike_node = page.xpath_node(%q(//button[@title="I dislike this"]/span))
  dislikes = dislike_node.try { |node| node.content.delete(",").to_i } || 0

  description_node = page.xpath_node(%q(//p[@id="eow-description"]))
  description = description_node.try { |node| node.to_xml } || ""

  wilson_score = ci_lower_bound(likes, likes + dislikes)

  time_node = page.xpath_node(%q(//strong[contains(@class,"watch-time-text")]))
  raise "Could not find date published" unless time_node

  # Strip the known lead-in phrases, in this order, leaving only the date or
  # the relative time.
  lead_ins = ["Published ", "Streamed live ", "Started streaming ", "on ", "Scheduled for "]
  time_text = lead_ins.reduce(time_node.content) { |text, lead_in| text.lchop(lead_in) }

  published = if time_text.includes?("ago")
                # Relative form such as "20 hours ago" or "40 minutes ago".
                delta = time_text.split(" ")[0].to_i
                if time_text.includes?("minute")
                  Time.now - delta.minutes
                elsif time_text.includes?("hour")
                  Time.now - delta.hours
                else
                  raise "Could not parse #{time_text}"
                end
              else
                # Absolute form such as "Feb 3, 2018".
                Time.parse(time_text, "%b %-d, %Y")
              end

  Video.new(id, info, Time.now, title, views, likes, dislikes, wilson_score, published, description)
end

# Returns the `Video` for `id`, using the `videos` table as a cache.
#
# * No row for `id`: the video is fetched, inserted and returned.
# * Row exists: it is returned as stored, unless `refresh` is true and the
#   row's `updated` time is more than an hour old, in which case the video is
#   fetched again, the row is updated and the fresh video is returned.
def get_video(id, client, db, refresh = true)
  known = db.query_one?("SELECT EXISTS (SELECT true FROM videos WHERE id = $1)", id, as: Bool)

  unless known
    fresh = fetch_video(id, client)
    db.exec("INSERT INTO videos VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)", fresh.to_a)
    return fresh
  end

  stored = db.query_one("SELECT * FROM videos WHERE id = $1", id, as: Video)

  # If record was last updated over an hour ago, refresh (expire param in response lasts for 6 hours)
  return stored unless refresh && Time.now - stored.updated > 1.hours

  fresh = fetch_video(id, client)
  db.exec("UPDATE videos SET info = $2, updated = $3,\
       title = $4, views = $5, likes = $6, dislikes = $7, wilson_score = $8,\
      published = $9, description = $10 WHERE id = $1", fresh.to_a)
  fresh
end

# Requests the YouTube results page for `query` through `client` and yields
# one value per video result found in the returned HTML.
#
# The yielded value is the text after the first "=" in the result's thumbnail
# link. List items that are not video results, or that have no thumbnail
# link, are skipped. `query` is interpolated into the URL as given.
def search(query, client)
  body = client.get("https://www.youtube.com/results?q=#{query}&sp=EgIQAVAU").body
  document = XML.parse_html(body)

  document.xpath_nodes(%q(//ol[@class="item-section"]/li)).each do |item|
    root = item.xpath_node(%q(div[contains(@class,"yt-lockup-video")]/div))
    next unless root

    link = root.xpath_node(%q(div[contains(@class,"yt-lockup-thumbnail")]/a/@href))
    next unless link

    yield link.content.split("=")[1]
  end
end

# Swaps the first element of `a` with the element at index `b % a.size`.
#
# The array is modified in place and also returned.
def splice(a, b)
  head = a[0]
  position = b % a.size
  a[0], a[position] = a[position], head
  a
end

# Applies a fixed sequence of transformations to the signature string `a`
# and returns the resulting string.
#
# Steps, on the signature split into one-character strings: reverse, drop
# the first three, `splice` with 35, drop the first, reverse, `splice` with 54.
def decrypt_signature(a)
  symbols = a.split("")

  symbols.reverse!
  symbols.delete_at(0..2)
  symbols = splice(symbols, 35)
  symbols.delete_at(0)
  symbols.reverse!
  symbols = splice(symbols, 54)

  symbols.join("")
end

# Returns the ids of the `n` "hottest" videos, hottest first.
#
# Candidates are the 10000 most recently published videos with more than
# 5000 views. Each one gets a temperature: its Wilson score multiplied by an
# exponential decay over the minutes since publication, so older videos tend
# to rank lower. Ties on temperature are ordered by descending id.
#
# At most `n` ids are returned; `n <= 0` yields an empty array. (The previous
# `top[0..n - 1]` slice returned every id for `n == 0`, because the range
# became `0..-1`.)
def rank_videos(db, n)
  scored = [] of {Float64, String}

  # One timestamp for the whole ranking so every row decays against the same instant.
  now = Time.now

  db.query("SELECT id, wilson_score, published FROM videos WHERE views > 5000 ORDER BY published DESC LIMIT 10000") do |rs|
    rs.each do
      id = rs.read(String)
      wilson_score = rs.read(Float64)
      published = rs.read(Time)

      # Exponential decay, older videos tend to rank lower
      temperature = wilson_score * Math.exp(-0.000005*((now - published).total_minutes))
      scored << {temperature, id}
    end
  end

  # Ascending tuple sort followed by a reverse puts the hottest first.
  scored.sort!
  scored.reverse!

  scored.first(Math.max(n, 0)).map { |entry| entry[1] }
end

# Creates an `HTTP::Client` for `url` with the given `context`, with both the
# connect and the read timeout set to 10 seconds.
def make_client(url, context)
  HTTP::Client.new(url, context).tap do |client|
    client.read_timeout = 10.seconds
    client.connect_timeout = 10.seconds
  end
end

# Finds the highest-scoring Reddit thread submitted for the video `id` and
# fetches its comments.
#
# Returns a tuple `{comments, thread}`: `comments` is the `children` array of
# the second element of the comments response, `thread` is the chosen
# `RedditThread`. Raises if the search returns no threads.
def get_reddit_comments(id, client)
  escaped_url = URI.escape("https://youtube.com/watch?v=#{id}")
  submissions = RedditSubmit.from_json(client.get("/submit.json?url=#{escaped_url}").body)

  # Sort ascending by score and take the last entry, i.e. the top-scoring thread.
  by_score = submissions.data.children.sort_by { |thread| thread.data.score }
  top_thread = by_score[-1]

  thread_data = top_thread.data
  listing_body = client.get("/r/#{thread_data.subreddit}/comments/#{thread_data.id}?sort=top&depth=3").body
  listing = JSON.parse(listing_body)

  {listing[1]["data"]["children"], top_thread}
end

# Renders a list of Reddit comment nodes, and recursively their replies, as
# an HTML string.
#
# Each element of `root` is read through `child["data"]`:
# * `body_html` - HTML-escaped comment body; elements without it are skipped
# * `author`, `score` - shown in the comment header
# * `replies` - either "" or an object whose `["data"]["children"]` is
#   rendered recursively
# * `depth` - comments deeper than 0 are indented by one grid column
#
# Finally, links to youtube.com are made relative by removing the scheme and
# host (`https://www.youtube.com/watch?v=x` becomes `/watch?v=x`). Links to
# any other site are left untouched; the previous pattern had `https://` as a
# standalone alternative and stripped it from every link.
#
# NOTE(review): `author` and `score` are interpolated without HTML escaping,
# and `body_html` is emitted as HTML after unescaping; this presumably relies
# on Reddit having sanitised those values - confirm.
def template_comments(root)
  html = ""
  root.each do |child|
    if child["data"]["body_html"]?
      author = child["data"]["author"]
      score = child["data"]["score"]
      body_html = HTML.unescape(child["data"]["body_html"].as_s)

      replies_html = ""
      if child["data"]["replies"] != ""
        replies_html = template_comments(child["data"]["replies"]["data"]["children"])
      end

      # TODO: Allow for expanding comments instead of just dismissing them

      content = <<-END_HTML
      <p>
        <a class="link" href="javascript:void(0)" onclick="dismiss(this.parentNode.parentNode)">[ - ]</a> 
        #{score} 
        <b>#{author}</b> 
      </p>
      <p>#{body_html}</p>
      #{replies_html}
      END_HTML

      if child["data"]["depth"].as_i > 0
        html += <<-END_HTML
          <div class="pure-g">
          <div class="pure-u-1-24"></div>
          <div class="pure-u-23-24">
          #{content}
          </div>
          </div>
        END_HTML
      else
        html += <<-END_HTML
          <div class="pure-g">
          <div class="pure-u-1">
          #{content}
          </div>
          </div>
        END_HTML
      end
    end
  end

  # Only rewrite youtube.com links: optional scheme, optional "www.", then the host.
  html = html.gsub(/(https?:\/\/)?(www\.)?(youtube\.com)/, "")

  return html
end

# Formats `number` with a comma between each group of three digits,
# e.g. 1234567 becomes "1,234,567".
def number_with_separator(number)
  # Work on the reversed text so that groups are counted from the right.
  reversed = number.to_s.reverse
  grouped = reversed.gsub(/(\d{3})(?=\d)/, "\\1,")
  grouped.reverse
end
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`class Video`
			`module HTTPParamConverter`
			`def self.from_rs(rs)`
			`HTTP::Params.parse(rs.read(String))`
			`end`
			`end`

			`module XMLConverter`
			`def self.from_rs(rs)`
			`XML.parse_html(rs.read(String))`
			`end`
			`end`

Remove html from DB 2018-02-27 06:28:45 +05:30			`def initialize(id, info, updated, title, views, likes, dislikes, wilson_score, published, description)`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`@id = id`
			`@info = info`
			`@updated = updated`
			`@title = title`
			`@views = views`
			`@likes = likes`
			`@dislikes = dislikes`
			`@wilson_score = wilson_score`
Add published field 2018-02-03 09:14:10 +05:30			`@published = published`
Remove html from DB 2018-02-27 06:28:45 +05:30			`@description = description`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`end`

			`def to_a`
Remove html from DB 2018-02-27 06:28:45 +05:30			`return [@id, @info, @updated, @title, @views, @likes, @dislikes, @wilson_score, @published, @description]`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`end`

			`DB.mapping({`
			`id: String,`
			`info: {`
			`type: HTTP::Params,`
			`default: HTTP::Params.parse(""),`
			`converter: Video::HTTPParamConverter,`
			`},`
			`updated: Time,`
			`title: String,`
			`views: Int64,`
			`likes: Int32,`
			`dislikes: Int32,`
			`wilson_score: Float64,`
Add published field 2018-02-03 09:14:10 +05:30			`published: Time,`
Remove html from DB 2018-02-27 06:28:45 +05:30			`description: String,`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`})`
			`end`

Add reddit comments 2018-03-04 02:36:14 +05:30			`class RedditSubmit`
			`JSON.mapping({`
			`data: RedditSubmitData,`
			`})`
			`end`

			`class RedditSubmitData`
			`JSON.mapping({`
			`children: Array(RedditThread),`
			`})`
			`end`

			`class RedditThread`
			`JSON.mapping({`
			`data: RedditThreadData,`
			`})`
			`end`

			`class RedditThreadData`
			`JSON.mapping({`
			`subreddit: String,`
			`id: String,`
			`num_comments: Int32,`
			`score: Int32,`
			`author: String,`
			`permalink: String,`
			`title: String,`
			`})`
			`end`

Move helpers into seperate file 2018-01-21 05:49:12 +05:30			`# See http://www.evanmiller.org/how-not-to-sort-by-average-rating.html`
			`def ci_lower_bound(pos, n)`
			`if n == 0`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`return 0.0`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30			`end`

			`# z value here represents a confidence level of 0.95`
			`z = 1.96`
			`phat = 1.0*pos/n`

			`return (phat + z*z/(2*n) - z * Math.sqrt((phat*(1 - phat) + z*z/(4*n))/n))/(1 + z*z/n)`
			`end`

			`def elapsed_text(elapsed)`
			`millis = elapsed.total_milliseconds`
			`return "#{millis.round(2)}ms" if millis >= 1`

			`"#{(millis * 1000).round(2)}µs"`
			`end`

Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`def get_client(pool)`
			`while pool.empty?`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30			`sleep rand(0..10).milliseconds`
			`end`

Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`return pool.shift`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30			`end`

Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`def fetch_video(id, client)`
			`begin`
			`info = client.get("/get_video_info?video_id=#{id}&el=detailpage&ps=default&eurl=&gl=US&hl=en").body`
			`html = client.get("/watch?v=#{id}").body`
			`end`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30
			`html = XML.parse_html(html)`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`info = HTTP::Params.parse(info)`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30
			`if info["reason"]?`
Partially bypass age-gate 2018-02-03 09:34:34 +05:30			`info = client.get("/get_video_info?video_id=#{id}&ps=default&eurl=&gl=US&hl=en").body`
			`info = HTTP::Params.parse(info)`
			`if info["reason"]?`
			`raise info["reason"]`
			`end`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30			`end`

Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`title = info["title"]`

			`views = info["view_count"].to_i64`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`likes = html.xpath_node(%q(//button[@title="I like this"]/span))`
Fix video indexing 2018-02-05 07:12:13 +05:30			`likes = likes ? likes.content.delete(",").to_i : 0`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30
			`dislikes = html.xpath_node(%q(//button[@title="I dislike this"]/span))`
			`dislikes = dislikes ? dislikes.content.delete(",").to_i : 0`

Remove html from DB 2018-02-27 06:28:45 +05:30			`description = html.xpath_node(%q(//p[@id="eow-description"]))`
			`description = description ? description.to_xml : ""`

Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`wilson_score = ci_lower_bound(likes, likes + dislikes)`

Fix video indexing 2018-02-05 07:12:13 +05:30			`published = html.xpath_node(%q(//strong[contains(@class,"watch-time-text")]))`
Add published field 2018-02-03 09:14:10 +05:30			`if published`
			`published = published.content`
Fix video indexing 2018-02-05 07:12:13 +05:30			`else`
			`raise "Could not find date published"`
			`end`

			`published = published.lchop("Published ")`
			`published = published.lchop("Streamed live ")`
			`published = published.lchop("Started streaming ")`
			`published = published.lchop("on ")`
			`published = published.lchop("Scheduled for ")`
Run 'crystal tool format' 2018-02-06 05:27:03 +05:30			`if !published.includes?("ago")`
			`published = Time.parse(published, "%b %-d, %Y")`
			`else`
			`# Time matches format "20 hours ago", "40 minutes ago"...`
Fix video indexing 2018-02-05 07:12:13 +05:30			`delta = published.split(" ")[0].to_i`
			`case published`
			`when .includes? "minute"`
			`published = Time.now - delta.minutes`
			`when .includes? "hour"`
			`published = Time.now - delta.hours`
Run 'crystal tool format' 2018-02-06 05:27:03 +05:30			`else`
Fix video indexing 2018-02-05 07:12:13 +05:30			`raise "Could not parse #{published}"`
Run 'crystal tool format' 2018-02-06 05:27:03 +05:30			`end`
Add published field 2018-02-03 09:14:10 +05:30			`end`

Remove html from DB 2018-02-27 06:28:45 +05:30			`video = Video.new(id, info, Time.now, title, views, likes, dislikes, wilson_score, published, description)`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30
			`return video`
			`end`

Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`def get_video(id, client, db, refresh = true)`
			`if db.query_one?("SELECT EXISTS (SELECT true FROM videos WHERE id = $1)", id, as: Bool)`
			`video = db.query_one("SELECT * FROM videos WHERE id = $1", id, as: Video)`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`# If record was last updated over an hour ago, refresh (expire param in response lasts for 6 hours)`
			`if refresh && Time.now - video.updated > 1.hours`
			`video = fetch_video(id, client)`
Remove html from DB 2018-02-27 06:28:45 +05:30			`db.exec("UPDATE videos SET info = $2, updated = $3,\`
			`title = $4, views = $5, likes = $6, dislikes = $7, wilson_score = $8,\`
			`published = $9, description = $10 WHERE id = $1", video.to_a)`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30			`end`
			`else`
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`video = fetch_video(id, client)`
Add published field 2018-02-03 09:14:10 +05:30			`db.exec("INSERT INTO videos VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)", video.to_a)`
Move helpers into seperate file 2018-01-21 05:49:12 +05:30			`end`

			`return video`
			`end`
Fix index out of bounds error 2018-01-22 05:19:27 +05:30
Move functions into helpers.cr 2018-01-28 07:39:27 +05:30			`def search(query, client)`
			`begin`
			`html = client.get("https://www.youtube.com/results?q=#{query}&sp=EgIQAVAU").body`
			`end`
Fix index out of bounds error 2018-01-22 05:19:27 +05:30
			`html = XML.parse_html(html)`

			`html.xpath_nodes(%q(//ol[@class="item-section"]/li)).each do |item|`
			`root = item.xpath_node(%q(div[contains(@class,"yt-lockup-video")]/div))`
			`if root`
			`link = root.xpath_node(%q(div[contains(@class,"yt-lockup-thumbnail")]/a/@href))`
			`if link`
			`yield link.content.split("=")[1]`
			`end`
			`end`
			`end`
			`end`
Add ability to decrypt signature for vflxuxnEY/en_US/base.js 2018-02-04 02:11:59 +05:30
Replace signature function with generic catchall 2018-02-23 00:31:37 +05:30			`def splice(a, b)`
Add ability to decrypt signature for vflxuxnEY/en_US/base.js 2018-02-04 02:11:59 +05:30			`c = a[0]`
Add support for different signature functions 2018-02-12 09:36:29 +05:30			`a[0] = a[b % a.size]`
			`a[b % a.size] = c`
			`return a`
			`end`

Replace signature function with generic catchall 2018-02-23 00:31:37 +05:30			`def decrypt_signature(a)`
Format and add function for vflCCoUi2 and vflg4IfMn 2018-02-13 22:14:11 +05:30			`a = a.split("")`
Add support for different signature functions 2018-02-12 09:36:29 +05:30
Replace signature function with generic catchall 2018-02-23 00:31:37 +05:30			`a.reverse!`
			`a.delete_at(0..2)`
Update signature 2018-03-03 21:21:58 +05:30			`a = splice(a, 35)`
			`a.delete_at(0)`
			`a.reverse!`
			`a = splice(a, 54)`
Add home page 2018-02-06 05:26:40 +05:30
Format and add function for vflCCoUi2 and vflg4IfMn 2018-02-13 22:14:11 +05:30			`return a.join("")`
			`end`

Add home page 2018-02-06 05:26:40 +05:30			`def rank_videos(db, n)`
			`top = [] of {Float64, String}`

			`db.query("SELECT id, wilson_score, published FROM videos WHERE views > 5000 ORDER BY published DESC LIMIT 10000") do |rs|`
			`rs.each do`
			`id = rs.read(String)`
			`wilson_score = rs.read(Float64)`
			`published = rs.read(Time)`

			`# Exponential decay, older videos tend to rank lower`
Fix rank_videos 2018-02-10 21:36:37 +05:30			`temperature = wilson_score * Math.exp(-0.000005*((Time.now - published).total_minutes))`
Add home page 2018-02-06 05:26:40 +05:30			`top << {temperature, id}`
			`end`
			`end`

			`top.sort!`

			`# Make hottest come first`
			`top.reverse!`
			`top = top.map { |a, b| b }`

			`# Return top`
Fix rank_videos 2018-02-14 00:44:21 +05:30			`return top[0..n - 1]`
Add home page 2018-02-06 05:26:40 +05:30			`end`
Refactor client creation 2018-02-06 06:37:49 +05:30
			`def make_client(url, context)`
Minor fixes 2018-02-27 06:29:02 +05:30			`client = HTTP::Client.new(url, context)`
Refactor client creation 2018-02-06 06:37:49 +05:30			`client.read_timeout = 10.seconds`
			`client.connect_timeout = 10.seconds`
			`return client`
			`end`
Add reddit comments 2018-03-04 02:36:14 +05:30
			`def get_reddit_comments(id, client)`
			`youtube_url = URI.escape("https://youtube.com/watch?v=#{id}")`
			`search_results = client.get("/submit.json?url=#{youtube_url}").body`
			`search_results = RedditSubmit.from_json(search_results)`

			`top_reddit_thread = search_results.data.children.sort_by { |child| child.data.score }[-1]`

			`comments = client.get("/r/#{top_reddit_thread.data.subreddit}/comments/#{top_reddit_thread.data.id}?sort=top&depth=3").body`
			`comments = JSON.parse(comments)`

			`return comments[1]["data"]["children"], top_reddit_thread`
			`end`

			`def template_comments(root)`
			`html = ""`
			`root.each do |child|`
			`if child["data"]["body_html"]?`
			`author = child["data"]["author"]`
			`score = child["data"]["score"]`
			`body_html = HTML.unescape(child["data"]["body_html"].as_s)`

			`replies_html = ""`
			`if child["data"]["replies"] != ""`
			`replies_html = template_comments(child["data"]["replies"]["data"]["children"])`
			`end`

			`# TODO: Allow for expanding comments instead of just dismissing them`

			`content = <<-END_HTML`
			`<p>`
			`<a class="link" href="javascript:void(0)" onclick="dismiss(this.parentNode.parentNode)">[ - ]</a>`
			`#{score}`
			`<b>#{author}</b>`
			`</p>`
			`<p>#{body_html}</p>`
			`#{replies_html}`
			`END_HTML`

			`if child["data"]["depth"].as_i > 0`
			`html += <<-END_HTML`
			`<div class="pure-g">`
			`<div class="pure-u-1-24"></div>`
			`<div class="pure-u-23-24">`
			`#{content}`
			`</div>`
			`</div>`
			`END_HTML`
			`else`
			`html += <<-END_HTML`
			`<div class="pure-g">`
			`<div class="pure-u-1">`
			`#{content}`
			`</div>`
			`</div>`
			`END_HTML`
			`end`
			`end`
			`end`

			`html = html.gsub(/(https:\/\/)|(http:\/\/)?(www\.)?(youtube\.com)/, "")`

			`return html`
			`end`
Separate with comma views, likes, dislikes 2018-03-04 02:40:56 +05:30
			`def number_with_separator(number)`
			`number.to_s.reverse.gsub(/(\d{3})(?=\d)/, "\\1,").reverse`
			`end`