# Define networks to be used later below
networks:
  # todo: support direct ASN lookups
  # todo: cache these values
  huawei-cloud:
    # AS136907
    - url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/136907/aggregated.json
      jq-path: '.subnets.ipv4[], .subnets.ipv6[]'
  alibaba-cloud:
    # AS45102
    - url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/45102/aggregated.json
      jq-path: '.subnets.ipv4[], .subnets.ipv6[]'
  aws-cloud:
    - url: https://ip-ranges.amazonaws.com/ip-ranges.json
      jq-path: '(.prefixes[] | select(has("ip_prefix")) | .ip_prefix), (.prefixes[] | select(has("ipv6_prefix")) | .ipv6_prefix)'
  google-cloud:
    - url: https://www.gstatic.com/ipranges/cloud.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  oracle-cloud:
    - url: https://docs.oracle.com/en-us/iaas/tools/public_ip_ranges.json
      jq-path: '.regions[] | .cidrs[] | .cidr'
  azure-cloud:
    # todo: https://www.microsoft.com/en-us/download/details.aspx?id=56519 does not provide direct JSON
    - url: https://raw.githubusercontent.com/femueller/cloud-ip-ranges/refs/heads/master/microsoft-azure-ip-ranges.json
      jq-path: '.values[] | .properties.addressPrefixes[]'
  digitalocean:
    - url: https://www.digitalocean.com/geo/google.csv
      regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
  linode:
    - url: https://geoip.linode.com/
      regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
  vultr:
    - url: "https://geofeed.constant.com/?json"
      jq-path: '.subnets[] | .ip_prefix'
  cloudflare:
    - url: https://www.cloudflare.com/ips-v4
      regex: "(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+/[0-9]+)"
    - url: https://www.cloudflare.com/ips-v6
      regex: "(?P<prefix>[0-9a-f:]+::/[0-9]+)"
  icloud-private-relay:
    - url: https://mask-api.icloud.com/egress-ip-ranges.csv
      regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
  tunnelbroker-relay:
    # HE Tunnelbroker
    - url: https://tunnelbroker.net/export/google
      regex: "(?P<prefix>([0-9a-f:]+::)/[0-9]+),"
  googlebot:
    - url: https://developers.google.com/static/search/apis/ipranges/googlebot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  bingbot:
    - url: https://www.bing.com/toolbox/bingbot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  qwantbot:
    - url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  duckduckbot:
    - url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/
      regex: "<li>(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)</li>"
  yandexbot:
    # todo: detected as bot
    # - url: https://yandex.com/ips
    #   regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\t]*"
    - prefixes:
        - "5.45.192.0/18"
        - "5.255.192.0/18"
        - "37.9.64.0/18"
        - "37.140.128.0/18"
        - "77.88.0.0/18"
        - "84.252.160.0/19"
        - "87.250.224.0/19"
        - "90.156.176.0/22"
        - "93.158.128.0/18"
        - "95.108.128.0/17"
        - "141.8.128.0/18"
        - "178.154.128.0/18"
        - "185.32.187.0/24"
        - "2a02:6b8::/29"
  kagibot:
    - url: https://kagi.com/bot
      regex: "\\n(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) "
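
# Illustration: each source above is reduced to a flat list of CIDR prefixes, either via its
# jq-path or via its regex capture group. The ipverse aggregated.json used for huawei-cloud and
# alibaba-cloud looks roughly like
#   {"subnets": {"ipv4": ["203.0.113.0/24", ...], "ipv6": ["2001:db8::/32", ...]}, ...}
# so '.subnets.ipv4[], .subnets.ipv6[]' emits one prefix per element; these lists are what
# inNetwork("<network-name>", remoteAddress) matches against in the rules further below.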

# todo: define interface
challenges:
  js-pow-sha256:
    # Asset must be under challenges/{name}/static/{asset}
    # Other files here will be available under that path
    mode: js
    asset: load.mjs
    parameters:
      difficulty: 20
    runtime:
      mode: wasm
      # Verify must be under challenges/{name}/runtime/{asset}
      asset: runtime.wasm
      probability: 0.02

  # Challenges with a cookie, self redirect (non-JS, requires HTTP parsing)
  self-cookie:
    mode: "cookie"

  # Challenges with a redirect via header (non-JS, requires HTTP parsing and logic)
  self-header-refresh:
    mode: "header-refresh"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  # Challenges with a redirect via meta (non-JS, requires HTML parsing and logic)
  self-meta-refresh:
    mode: "meta-refresh"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  # Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic)
  self-resource-load:
    mode: "resource-load"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  http-cookie-check:
    mode: http
    url: http://gitea:3000/user/stopwatches
    # url: http://gitea:3000/repo/search
    # url: http://gitea:3000/notifications/new
    parameters:
      http-cookie: i_like_gitea
      http-method: GET
      http-code: 200

conditions:
  # Conditions will get replaced in the rules AST when found as ($condition-name)

  # Checks to detect a headless chromium via headers only
  is-headless-chromium:
    - 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")'
    - '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))'
    - '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))'

  is-static-asset:
    - 'path == "/robots.txt"'
    - 'path == "/favicon.ico"'
    - 'path == "/apple-touch-icon.png"'
    - 'path == "/apple-touch-icon-precomposed.png"'
    - 'path.startsWith("/assets/")'
    - 'path.startsWith("/repo-avatars/")'
    - 'path.startsWith("/avatars/")'
    - 'path.startsWith("/avatar/")'
    - 'path.startsWith("/attachments/")'

  is-git-ua:
    - 'userAgent.startsWith("git/")'
    - 'userAgent.startsWith("go-git")'
    - 'userAgent.startsWith("JGit/") || userAgent.startsWith("JGit-")'
    # Golang proxy and initial fetch
    - 'userAgent.startsWith("GoModuleMirror/")'
    - 'userAgent.startsWith("Go-http-client/") && "go-get" in query && query["go-get"] == "1" && (path.matches("^/[^/]+/[^/]+$") || path.matches("^/[^/]+/[^/]+/v[0-9]+$"))'

  is-git-path:
    - 'path.matches("^/[^/]+/[^/]+/(git-upload-pack|git-receive-pack|HEAD|info/refs|info/lfs|objects)")'
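
# Illustration of the snippet mechanism: in the rules below, a condition entry such as
#   - '($is-headless-chromium)'
# behaves roughly as if the three is-headless-chromium expressions above had been written
# directly in that rule's condition list (entries appear to act as alternatives, so any one
# matching is enough).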

rules:
  - name: undesired-networks
    conditions:
      - 'inNetwork("huawei-cloud", remoteAddress) || inNetwork("alibaba-cloud", remoteAddress)'
    action: deny

  - name: undesired-crawlers
    conditions:
      - '($is-headless-chromium)'
      - 'userAgent == ""'
      - 'userAgent.startsWith("Lightpanda/")'
      # Typo'd opera botnet
      - 'userAgent.matches("^Opera/[0-9.]+\\.\\(")'
      # AI bullshit stuff, they do not respect robots.txt even while they read it
      - 'userAgent.contains("Amazonbot") || userAgent.contains("Bytespider") || userAgent.contains("CCBot") || userAgent.contains("GPTBot") || userAgent.contains("ClaudeBot") || userAgent.contains("meta-externalagent/")'
    action: deny

  - name: suspicious-crawlers
    conditions:
      - 'userAgent.contains("Presto/") || userAgent.contains("Trident/")'
      # Old IE browsers
      - 'userAgent.matches("MSIE ([4-9]|10|11)\\.")'
      # Old Linux browsers
      - 'userAgent.contains("Linux i686")'
      # Old Windows browsers
      - 'userAgent.matches("Windows (95|98)") || userAgent.matches("Windows NT [1-4]\\.")'
      # Old mobile browsers
      - 'userAgent.matches("Android [1-9]\\.") || userAgent.matches("(iPad|iPhone) OS [1-9]_")'
      # Old Opera browsers
      - 'userAgent.startsWith("Opera/")'
    # check to continue below
    action: check
    challenges: [js-pow-sha256, http-cookie-check]

  - name: always-pow-challenge
    conditions:
      - 'path.startsWith("/user/sign_up") || path.startsWith("/user/login")'
      # Match archive downloads from browsers and not tools
      - 'path.matches("^/[^/]+/[^/]+/archive/.*\\.(bundle|zip|tar\\.gz)") && (userAgent.startsWith("Opera/") || userAgent.startsWith("Mozilla/"))'
    action: challenge
    challenges: [js-pow-sha256]

  - name: allow-static-resources
    conditions:
      - '($is-static-asset)'
    action: pass

  - name: allow-git-operations
    conditions:
      - '($is-git-path)'
      - 'path.matches("^/[^/]+/[^/]+\\.git")'
      - 'path.matches("^/[^/]+/[^/]+/") && ($is-git-ua)'
    action: pass

  - name: sitemap
    conditions:
      - 'path == "/sitemap.xml" || path.matches("^/explore/(users|repos)/sitemap-[0-9]+\\.xml$")'
    action: pass

  # TODO: rss
  - name: api-call
    conditions:
      - 'path.startsWith("/.well-known")'
      - 'path.startsWith("/api/v1/") || path.startsWith("/api/forgejo/v1/")'
      - 'path.startsWith("/login/oauth/")'
      - 'path.startsWith("/captcha/")'
      - 'path.startsWith("/metrics/")'
      # todo: post only
      - 'path == "/-/markup"'
      - 'path == "/user/events"'
      - 'path == "/ssh_info"'
      - 'path == "/api/healthz"'
      # user pubkeys
      - 'path.matches("^/[^/]+\\.keys$")'
      - 'path.matches("^/[^/]+\\.gpg")'
    action: pass

  - name: preview-fetchers
    conditions:
      - 'path.endsWith("/-/summary-card")'
      #- 'userAgent.contains("facebookexternalhit/")'
      - 'userAgent.contains("Twitterbot/")'
      - '"X-Purpose" in headers && headers["X-Purpose"] == "preview"'
    action: pass

  - name: desired-crawlers
    conditions:
      - 'userAgent.contains("+https://kagi.com/bot") && inNetwork("kagibot", remoteAddress)'
      - 'userAgent.contains("+http://www.google.com/bot.html") && inNetwork("googlebot", remoteAddress)'
      - 'userAgent.contains("+http://www.bing.com/bingbot.htm") && inNetwork("bingbot", remoteAddress)'
      - 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && inNetwork("duckduckbot", remoteAddress)'
      - 'userAgent.contains("+https://help.qwant.com/bot/") && inNetwork("qwantbot", remoteAddress)'
      - 'userAgent.contains("+http://yandex.com/bots") && inNetwork("yandexbot", remoteAddress)'
    action: pass

  - name: homesite
    conditions:
      - 'path == "/"'
      # generic /*/*/ match gave too many options for scrapers to trigger random endpoints
      - 'path.matches("(?i)^/(WeebDataHoarder|P2Pool|mirror|git|S\\.O\\.N\\.G|FM10K|Sillycom|pwgen2155|kaitou|metonym)/[^/]+$")'
    action: pass

  - name: suspicious-fetchers
    action: challenge
    challenges: [js-pow-sha256, http-cookie-check]
    conditions:
      - 'userAgent.contains("facebookexternalhit/") || userAgent.contains("facebookcatalog/")'

  - name: heavy-operations
    action: check
    # check we are logged in, or force PoW
    challenges: [js-pow-sha256, http-cookie-check]
    conditions:
      - 'path.startsWith("/explore/")'
      - 'path.matches("^/[^/]+/[^/]+/src/commit/")'
      - 'path.matches("^/[^/]+/[^/]+/compare/")'
      - 'path.matches("^/[^/]+/[^/]+/commits/commit/")'
      - 'path.matches("^/[^/]+/[^/]+/blame/")'
      - 'path.matches("^/[^/]+/[^/]+/search/")'
      - 'path.matches("^/[^/]+/[^/]+/find/")'
      - 'path.matches("^/[^/]+/[^/]+/activity")'
      # any search with a custom query
      - '"q" in query && query.q != ""'
      # user activity tab
      - 'path.matches("^/[^/]") && "tab" in query && query.tab == "activity"'

  # Allow all source downloads not caught in browser above
  # todo: limit this as needed?
  - name: source-download
    conditions:
      - 'path.matches("^/[^/]+/[^/]+/raw/branch/")'
      - 'path.matches("^/[^/]+/[^/]+/archive/")'
      - 'path.matches("^/[^/]+/[^/]+/media/")'
      - 'path.matches("^/[^/]+/[^/]+/releases/download/")'
    action: pass

  - name: standard-browser
    action: challenge
    challenges: [http-cookie-check, self-resource-load, self-meta-refresh, js-pow-sha256]
    conditions:
      - 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")'
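
# Rules appear to be evaluated in the order they are listed, so the specific pass/deny rules
# above must stay ahead of this broad standard-browser challenge. A new pass rule for another
# public area could follow the same shape (name and path below are placeholders):
#   - name: my-public-area
#     conditions:
#       - 'path.startsWith("/some-public-prefix/")'
#     action: pass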