# Define networks to be used later below
networks:
  # todo: support direct ASN lookups
  # todo: cache these values
  huawei-cloud:
    # AS136907
    - url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/136907/aggregated.json
      jq-path: '.subnets.ipv4[], .subnets.ipv6[]'
  alibaba-cloud:
    # AS45102
    - url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/45102/aggregated.json
      jq-path: '.subnets.ipv4[], .subnets.ipv6[]'
  aws-cloud:
    - url: https://ip-ranges.amazonaws.com/ip-ranges.json
      jq-path: '(.prefixes[] | select(has("ip_prefix")) | .ip_prefix), (.prefixes[] | select(has("ipv6_prefix")) | .ipv6_prefix)'
  google-cloud:
    - url: https://www.gstatic.com/ipranges/cloud.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  oracle-cloud:
    - url: https://docs.oracle.com/en-us/iaas/tools/public_ip_ranges.json
      jq-path: '.regions[] | .cidrs[] | .cidr'
  azure-cloud:
    # todo: https://www.microsoft.com/en-us/download/details.aspx?id=56519 does not provide direct JSON
    - url: https://raw.githubusercontent.com/femueller/cloud-ip-ranges/refs/heads/master/microsoft-azure-ip-ranges.json
      jq-path: '.values[] | .properties.addressPrefixes[]'
  digitalocean:
    - url: https://www.digitalocean.com/geo/google.csv
      regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
  linode:
    - url: https://geoip.linode.com/
      regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
  vultr:
    - url: "https://geofeed.constant.com/?json"
      jq-path: '.subnets[] | .ip_prefix'
  cloudflare:
    - url: https://www.cloudflare.com/ips-v4
      regex: "(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+/[0-9]+)"
    - url: https://www.cloudflare.com/ips-v6
      regex: "(?P<prefix>[0-9a-f:]+::/[0-9]+)"
  icloud-private-relay:
    - url: https://mask-api.icloud.com/egress-ip-ranges.csv
      regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
  tunnelbroker-relay:
    # HE Tunnelbroker
    - url: https://tunnelbroker.net/export/google
      regex: "(?P<prefix>([0-9a-f:]+::)/[0-9]+),"
  googlebot:
    - url: https://developers.google.com/static/search/apis/ipranges/googlebot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  bingbot:
    - url: https://www.bing.com/toolbox/bingbot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  qwantbot:
    - url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json
      jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
  duckduckbot:
    - url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/
      regex: "<li>(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)</li>"
  yandexbot:
    # todo: detected as bot
    # - url: https://yandex.com/ips
    #   regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\t]*"
    - prefixes:
        - "5.45.192.0/18"
        - "5.255.192.0/18"
        - "37.9.64.0/18"
        - "37.140.128.0/18"
        - "77.88.0.0/18"
        - "84.252.160.0/19"
        - "87.250.224.0/19"
        - "90.156.176.0/22"
        - "93.158.128.0/18"
        - "95.108.128.0/17"
        - "141.8.128.0/18"
        - "178.154.128.0/18"
        - "185.32.187.0/24"
        - "2a02:6b8::/29"
  kagibot:
    - url: https://kagi.com/bot
      regex: "\\n(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) "
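
# Illustration: each source above is reduced to a flat list of CIDR prefixes, either via its
# jq-path or via its regex capture group. The ipverse aggregated.json used for huawei-cloud and
# alibaba-cloud looks roughly like
#   {"subnets": {"ipv4": ["203.0.113.0/24", ...], "ipv6": ["2001:db8::/32", ...]}, ...}
# so '.subnets.ipv4[], .subnets.ipv6[]' emits one prefix per element; these lists are what
# inNetwork("<network-name>", remoteAddress) matches against in the rules further below.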

# todo: define interface
challenges:
  js-pow-sha256:
    # Asset must be under challenges/{name}/static/{asset}
    # Other files here will be available under that path
    mode: js
    asset: load.mjs
    parameters:
      difficulty: 20
    runtime:
      mode: wasm
      # Verify must be under challenges/{name}/runtime/{asset}
      asset: runtime.wasm
      probability: 0.02

  # Challenges with a cookie, self redirect (non-JS, requires HTTP parsing)
  self-cookie:
    mode: "cookie"

  # Challenges with a redirect via header (non-JS, requires HTTP parsing and logic)
  self-header-refresh:
    mode: "header-refresh"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  # Challenges with a redirect via meta (non-JS, requires HTML parsing and logic)
  self-meta-refresh:
    mode: "meta-refresh"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  # Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic)
  self-resource-load:
    mode: "resource-load"
    runtime:
      # verifies that result = key
      mode: "key"
      probability: 0.1

  http-cookie-check:
    mode: http
    url: http://gitea:3000/user/stopwatches
    # url: http://gitea:3000/repo/search
    # url: http://gitea:3000/notifications/new
    parameters:
      http-cookie: i_like_gitea
      http-method: GET
      http-code: 200

conditions:
  # Conditions will get replaced in the rules AST when found as ($condition-name)

  # Checks to detect a headless chromium via headers only
  is-headless-chromium:
    - 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")'
    - '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))'
    - '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))'

  is-static-asset:
    - 'path == "/robots.txt"'
    - 'path == "/favicon.ico"'
    - 'path == "/apple-touch-icon.png"'
    - 'path == "/apple-touch-icon-precomposed.png"'
    - 'path.startsWith("/assets/")'
    - 'path.startsWith("/repo-avatars/")'
    - 'path.startsWith("/avatars/")'
    - 'path.startsWith("/avatar/")'
    - 'path.startsWith("/attachments/")'

  is-git-ua:
    - 'userAgent.startsWith("git/")'
    - 'userAgent.startsWith("go-git")'
    - 'userAgent.startsWith("JGit/") || userAgent.startsWith("JGit-")'
    # Golang proxy and initial fetch
    - 'userAgent.startsWith("GoModuleMirror/")'
    - 'userAgent.startsWith("Go-http-client/") && "go-get" in query && query["go-get"] == "1" && (path.matches("^/[^/]+/[^/]+$") || path.matches("^/[^/]+/[^/]+/v[0-9]+$"))'

  is-git-path:
    - 'path.matches("^/[^/]+/[^/]+/(git-upload-pack|git-receive-pack|HEAD|info/refs|info/lfs|objects)")'
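
# Illustration of the snippet mechanism: in the rules below, a condition entry such as
#   - '($is-headless-chromium)'
# behaves roughly as if the three is-headless-chromium expressions above had been written
# directly in that rule's condition list (entries appear to act as alternatives, so any one
# matching is enough).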

rules:
  - name: undesired-networks
    conditions:
      - 'inNetwork("huawei-cloud", remoteAddress) || inNetwork("alibaba-cloud", remoteAddress)'
    action: deny

  - name: undesired-crawlers
    conditions:
      - '($is-headless-chromium)'
      - 'userAgent == ""'
      - 'userAgent.startsWith("Lightpanda/")'
      # Typo'd opera botnet
      - 'userAgent.matches("^Opera/[0-9.]+\\.\\(")'
      # AI bullshit stuff, they do not respect robots.txt even while they read it
      - 'userAgent.contains("Amazonbot") || userAgent.contains("Bytespider") || userAgent.contains("CCBot") || userAgent.contains("GPTBot") || userAgent.contains("ClaudeBot") || userAgent.contains("meta-externalagent/")'
    action: deny

  - name: suspicious-crawlers
    conditions:
      - 'userAgent.contains("Presto/") || userAgent.contains("Trident/")'
      # Old IE browsers
      - 'userAgent.matches("MSIE ([4-9]|10|11)\\.")'
      # Old Linux browsers
      - 'userAgent.contains("Linux i686")'
      # Old Windows browsers
      - 'userAgent.matches("Windows (95|98)") || userAgent.matches("Windows NT [1-4]\\.")'
      # Old mobile browsers
      - 'userAgent.matches("Android [1-9]\\.") || userAgent.matches("(iPad|iPhone) OS [1-9]_")'
      # Old Opera browsers
      - 'userAgent.startsWith("Opera/")'
    # check to continue below
    action: check
    challenges: [js-pow-sha256, http-cookie-check]

  - name: always-pow-challenge
    conditions:
      - 'path.startsWith("/user/sign_up") || path.startsWith("/user/login")'
      # Match archive downloads from browsers and not tools
      - 'path.matches("^/[^/]+/[^/]+/archive/.*\\.(bundle|zip|tar\\.gz)") && (userAgent.startsWith("Opera/") || userAgent.startsWith("Mozilla/"))'
    action: challenge
    challenges: [js-pow-sha256]

  - name: allow-static-resources
    conditions:
      - '($is-static-asset)'
    action: pass

  - name: allow-git-operations
    conditions:
      - '($is-git-path)'
      - 'path.matches("^/[^/]+/[^/]+\\.git")'
      - 'path.matches("^/[^/]+/[^/]+/") && ($is-git-ua)'
    action: pass

  - name: sitemap
    conditions:
      - 'path == "/sitemap.xml" || path.matches("^/explore/(users|repos)/sitemap-[0-9]+\\.xml$")'
    action: pass

  # TODO: rss
  - name: api-call
    conditions:
      - 'path.startsWith("/.well-known")'
      - 'path.startsWith("/api/v1/") || path.startsWith("/api/forgejo/v1/")'
      - 'path.startsWith("/login/oauth/")'
      - 'path.startsWith("/captcha/")'
      - 'path.startsWith("/metrics/")'
      # todo: post only
      - 'path == "/-/markup"'
      - 'path == "/user/events"'
      - 'path == "/ssh_info"'
      - 'path == "/api/healthz"'
      # user pubkeys
      - 'path.matches("^/[^/]+\\.keys$")'
      - 'path.matches("^/[^/]+\\.gpg")'
    action: pass

  - name: preview-fetchers
    conditions:
      - 'path.endsWith("/-/summary-card")'
      #- 'userAgent.contains("facebookexternalhit/")'
      - 'userAgent.contains("Twitterbot/")'
      - '"X-Purpose" in headers && headers["X-Purpose"] == "preview"'
    action: pass

  - name: desired-crawlers
    conditions:
      - 'userAgent.contains("+https://kagi.com/bot") && inNetwork("kagibot", remoteAddress)'
      - 'userAgent.contains("+http://www.google.com/bot.html") && inNetwork("googlebot", remoteAddress)'
      - 'userAgent.contains("+http://www.bing.com/bingbot.htm") && inNetwork("bingbot", remoteAddress)'
      - 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && inNetwork("duckduckbot", remoteAddress)'
      - 'userAgent.contains("+https://help.qwant.com/bot/") && inNetwork("qwantbot", remoteAddress)'
      - 'userAgent.contains("+http://yandex.com/bots") && inNetwork("yandexbot", remoteAddress)'
    action: pass

  - name: homesite
    conditions:
      - 'path == "/"'
      # generic /*/*/ match gave too many options for scrapers to trigger random endpoints
      - 'path.matches("(?i)^/(WeebDataHoarder|P2Pool|mirror|git|S\\.O\\.N\\.G|FM10K|Sillycom|pwgen2155|kaitou|metonym)/[^/]+$")'
    action: pass

  - name: suspicious-fetchers
    action: challenge
    challenges: [js-pow-sha256, http-cookie-check]
    conditions:
      - 'userAgent.contains("facebookexternalhit/") || userAgent.contains("facebookcatalog/")'

  - name: heavy-operations
    action: check
    # check we are logged in, or force PoW
    challenges: [js-pow-sha256, http-cookie-check]
    conditions:
      - 'path.startsWith("/explore/")'
      - 'path.matches("^/[^/]+/[^/]+/src/commit/")'
      - 'path.matches("^/[^/]+/[^/]+/compare/")'
      - 'path.matches("^/[^/]+/[^/]+/commits/commit/")'
      - 'path.matches("^/[^/]+/[^/]+/blame/")'
      - 'path.matches("^/[^/]+/[^/]+/search/")'
      - 'path.matches("^/[^/]+/[^/]+/find/")'
      - 'path.matches("^/[^/]+/[^/]+/activity")'
      # any search with a custom query
      - '"q" in query && query.q != ""'
      # user activity tab
      - 'path.matches("^/[^/]") && "tab" in query && query.tab == "activity"'

  # Allow all source downloads not caught in browser above
  # todo: limit this as needed?
  - name: source-download
    conditions:
      - 'path.matches("^/[^/]+/[^/]+/raw/branch/")'
      - 'path.matches("^/[^/]+/[^/]+/archive/")'
      - 'path.matches("^/[^/]+/[^/]+/media/")'
      - 'path.matches("^/[^/]+/[^/]+/releases/download/")'
    action: pass

  - name: standard-browser
    action: challenge
    challenges: [http-cookie-check, self-resource-load, self-meta-refresh, js-pow-sha256]
    conditions:
      - 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")'
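
# Rules appear to be evaluated in the order they are listed, so the specific pass/deny rules
# above must stay ahead of this broad standard-browser challenge. A new pass rule for another
# public area could follow the same shape (name and path below are placeholders):
#   - name: my-public-area
#     conditions:
#       - 'path.startsWith("/some-public-prefix/")'
#     action: pass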