Files
go-away/examples/forgejo.yml
WeebDataHoarder ead41055ca Condition, rules, state and action refactor / rewrite
Add nested rules
Add backend action, allow wildcard in backends
Remove poison from tree, update README with action table

Allow defining pass/fail actions on challenge,

Remove redirect/referer parameters on backend pass

Set challenge cookie tied to host

Rewrite DNSBL condition into a challenge

Allow passing an arbitrary path for assets to js challenges

Optimize programs exhaustively on compilation

Activation instead of map for CEL context, faster map access, new network override

Return valid host on cookie setting in case Host is an IP address.
bug: does not work with IPv6, see https://github.com/golang/go/issues/65521

Apply TLS fingerprinter on GetConfigForClient instead of GetCertificate

Cleanup go-away cookies before passing to backend

Code action for specifically replying with an HTTP code
2025-04-23 20:35:20 +02:00

477 lines
19 KiB
YAML

# Example cmdline (forward requests from upstream to port :8080)
# $ go-away --bind :8080 --backend git.example.com=http://forgejo:3000 --policy examples/forgejo.yml --challenge-template forgejo --challenge-template-theme forgejo-auto
# Define networks to be used later below
networks:
# todo: support direct ASN lookups
# todo: cache these values
huawei-cloud:
# AS136907
- url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/136907/aggregated.json
jq-path: '.subnets.ipv4[], .subnets.ipv6[]'
alibaba-cloud:
# AS45102
- url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/45102/aggregated.json
jq-path: '.subnets.ipv4[], .subnets.ipv6[]'
zenlayer-inc:
# AS21859
- url: https://raw.githubusercontent.com/ipverse/asn-ip/refs/heads/master/as/21859/aggregated.json
jq-path: '.subnets.ipv4[], .subnets.ipv6[]'
# aws-cloud:
# - url: https://ip-ranges.amazonaws.com/ip-ranges.json
# jq-path: '(.prefixes[] | select(has("ip_prefix")) | .ip_prefix), (.prefixes[] | select(has("ipv6_prefix")) | .ipv6_prefix)'
# google-cloud:
# - url: https://www.gstatic.com/ipranges/cloud.json
# jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
# oracle-cloud:
# - url: https://docs.oracle.com/en-us/iaas/tools/public_ip_ranges.json
# jq-path: '.regions[] | .cidrs[] | .cidr'
# azure-cloud:
# # todo: https://www.microsoft.com/en-us/download/details.aspx?id=56519 does not provide direct JSON
# - url: https://raw.githubusercontent.com/femueller/cloud-ip-ranges/refs/heads/master/microsoft-azure-ip-ranges.json
# jq-path: '.values[] | .properties.addressPrefixes[]'
#
# digitalocean:
# - url: https://www.digitalocean.com/geo/google.csv
# regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
# linode:
# - url: https://geoip.linode.com/
# regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
# vultr:
# - url: "https://geofeed.constant.com/?json"
# jq-path: '.subnets[] | .ip_prefix'
# cloudflare:
# - url: https://www.cloudflare.com/ips-v4
# regex: "(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+/[0-9]+)"
# - url: https://www.cloudflare.com/ips-v6
# regex: "(?P<prefix>[0-9a-f:]+::/[0-9]+)"
#
# icloud-private-relay:
# - url: https://mask-api.icloud.com/egress-ip-ranges.csv
# regex: "(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+),"
# tunnelbroker-relay:
# # HE Tunnelbroker
# - url: https://tunnelbroker.net/export/google
# regex: "(?P<prefix>([0-9a-f:]+::)/[0-9]+),"
googlebot:
- url: https://developers.google.com/static/search/apis/ipranges/googlebot.json
jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
bingbot:
- url: https://www.bing.com/toolbox/bingbot.json
jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
qwantbot:
- url: https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json
jq-path: '(.prefixes[] | select(has("ipv4Prefix")) | .ipv4Prefix), (.prefixes[] | select(has("ipv6Prefix")) | .ipv6Prefix)'
duckduckbot:
- url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
regex: "<li><div>(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)</div></li>"
yandexbot:
# todo: detected as bot
# - url: https://yandex.com/ips
# regex: "<span>(?P<prefix>(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)|([0-9a-f:]+::))/[0-9]+)[ \\\\t]*</span><br/>"
- prefixes:
- "5.45.192.0/18"
- "5.255.192.0/18"
- "37.9.64.0/18"
- "37.140.128.0/18"
- "77.88.0.0/18"
- "84.252.160.0/19"
- "87.250.224.0/19"
- "90.156.176.0/22"
- "93.158.128.0/18"
- "95.108.128.0/17"
- "141.8.128.0/18"
- "178.154.128.0/18"
- "185.32.187.0/24"
- "2a02:6b8::/29"
kagibot:
- url: https://kagi.com/bot
regex: "\\n(?P<prefix>[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+) "
challenges:
js-pow-sha256:
runtime: js
parameters:
# specifies the folder path that assets are under
# can be either embedded or external path
# defaults to name of challenge
path: "js-pow-sha256"
# needs to be under static folder
js-loader: load.mjs
# needs to be under runtime folder
wasm-runtime: runtime.wasm
wasm-runtime-settings:
difficulty: 20
verify-probability: 0.02
# Challenges with a cookie, self redirect (non-JS, requires HTTP parsing)
self-cookie:
runtime: "cookie"
# Challenges with a redirect via Link header with rel=preload and early hints (non-JS, requires HTTP parsing, fetching and logic)
# Works on HTTP/2 and above!
self-preload-link:
condition: '"Sec-Fetch-Mode" in headers && headers["Sec-Fetch-Mode"] == "navigate"'
runtime: "preload-link"
parameters:
preload-early-hint-deadline: 3s
# Challenges with a redirect via Refresh header (non-JS, requires HTTP parsing and logic)
self-header-refresh:
runtime: "refresh"
parameters:
refresh-via: "header"
# Challenges with a redirect via Refresh meta (non-JS, requires HTML parsing and logic)
self-meta-refresh:
runtime: "refresh"
parameters:
refresh-via: "meta"
# Challenges with loading a random CSS or image document (non-JS, requires HTML parsing and logic)
self-resource-load:
runtime: "resource-load"
# Verifies the existence of a cookie and confirms it against some backend request, passing the entire client cookie contents
http-cookie-check:
runtime: http
parameters:
http-url: http://forgejo:3000/user/stopwatches
# http-url: http://forgejo:3000/repo/search
# http-url: http://forgejo:3000/notifications/new
http-method: GET
http-cookie: i_like_gitea
http-code: 200
verify-probability: 0.1
dnsbl:
runtime: dnsbl
parameters:
dnsbl-decay: 1h
dnsbl-timeout: 1s
conditions:
# Conditions will get replaced on rules AST when found as ($condition-name)
# Checks to detect a headless chromium via headers only
is-headless-chromium:
- 'userAgent.contains("HeadlessChrome") || userAgent.contains("HeadlessChromium")'
- '"Sec-Ch-Ua" in headers && (headers["Sec-Ch-Ua"].contains("HeadlessChrome") || headers["Sec-Ch-Ua"].contains("HeadlessChromium"))'
#- '(userAgent.contains("Chrome/") || userAgent.contains("Chromium/")) && (!("Accept-Language" in headers) || !("Accept-Encoding" in headers))'
is-generic-browser:
- 'userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")'
is-well-known-asset:
- 'path == "/robots.txt"'
- 'path.startsWith("/.well-known")'
is-static-asset:
- 'path == "/favicon.ico"'
- 'path == "/apple-touch-icon.png"'
- 'path == "/apple-touch-icon-precomposed.png"'
- 'path.startsWith("/assets/")'
- 'path.startsWith("/repo-avatars/")'
- 'path.startsWith("/avatars/")'
- 'path.startsWith("/avatar/")'
- 'path.startsWith("/user/avatar/")'
- 'path.startsWith("/attachments/")'
is-git-ua:
- 'userAgent.startsWith("git/") || userAgent.contains("libgit")'
- 'userAgent.startsWith("go-git")'
- 'userAgent.startsWith("JGit/") || userAgent.startsWith("JGit-")'
# Golang proxy and initial fetch
- 'userAgent.startsWith("GoModuleMirror/")'
- 'userAgent.startsWith("Go-http-client/") && "go-get" in query && query["go-get"] == "1"'
- '"Git-Protocol" in headers && headers["Git-Protocol"] == "version=2"'
is-git-path:
- 'path.matches("^/[^/]+/[^/]+/(git-upload-pack|git-receive-pack|HEAD|info/refs|info/lfs|objects)")'
is-generic-robot-ua:
- 'userAgent.contains("compatible;") && !userAgent.contains("Trident/")'
- 'userAgent.matches("\\+https?://")'
- 'userAgent.contains("@")'
- 'userAgent.matches("[bB]ot/[0-9]")'
is-tool-ua:
- 'userAgent.startsWith("python-requests/")'
- 'userAgent.startsWith("Python-urllib/")'
- 'userAgent.startsWith("python-httpx/")'
- 'userAgent.contains("aoihttp/")'
- 'userAgent.startsWith("http.rb/")'
- 'userAgent.startsWith("curl/")'
- 'userAgent.startsWith("Wget/")'
- 'userAgent.startsWith("libcurl/")'
- 'userAgent.startsWith("okhttp/")'
- 'userAgent.startsWith("Java/")'
- 'userAgent.startsWith("Apache-HttpClient//")'
- 'userAgent.startsWith("Go-http-client/")'
- 'userAgent.startsWith("node-fetch/")'
- 'userAgent.startsWith("reqwest/")'
is-suspicious-crawler:
# TLS Fingerprint for specific agent without ALPN
- '(userAgent.startsWith("Mozilla/") || userAgent.startsWith("Opera/")) && ("ja4" in fp && fp.ja4.matches("^t[0-9a-z]+00_"))'
# Old engines
- 'userAgent.contains("Presto/") || userAgent.contains("Trident/")'
# Old IE browsers
- 'userAgent.matches("MSIE ([2-9]|10|11)\\.")'
# Old Linux browsers
- 'userAgent.matches("Linux i[63]86") || userAgent.matches("FreeBSD i[63]86")'
# Old Windows browsers
- 'userAgent.matches("Windows (3|95|98|CE)") || userAgent.matches("Windows NT [1-5]\\.")'
# Old mobile browsers
- 'userAgent.matches("Android [1-5]\\.") || userAgent.matches("(iPad|iPhone) OS [1-9]_")'
# Old generic browsers
- 'userAgent.startsWith("Opera/")'
#- 'userAgent.matches("Gecko/(201[0-9]|200[0-9])")'
- 'userAgent.matches("^Mozilla/[1-4]")'
is-heavy-resource:
- 'path.startsWith("/explore/")'
- 'path.matches("^/[^/]+/[^/]+/src/commit/")'
- 'path.matches("^/[^/]+/[^/]+/compare/")'
- 'path.matches("^/[^/]+/[^/]+/commits/commit/")'
- 'path.matches("^/[^/]+/[^/]+/blame/")'
- 'path.matches("^/[^/]+/[^/]+/search/")'
- 'path.matches("^/[^/]+/[^/]+/find/")'
- 'path.matches("^/[^/]+/[^/]+/activity")'
# any search with a custom query
- '"q" in query && query.q != ""'
# user activity tab
- 'path.matches("^/[^/]+$") && "tab" in query && query.tab == "activity"'
rules:
- name: allow-well-known-resources
conditions:
- '($is-well-known-asset)'
action: pass
- name: allow-static-resources
conditions:
- '($is-static-asset)'
action: pass
- name: undesired-networks
conditions:
- 'remoteAddress.network("huawei-cloud") || remoteAddress.network("alibaba-cloud") || remoteAddress.network("zenlayer-inc")'
action: drop
- name: undesired-crawlers
conditions:
- '($is-headless-chromium)'
- 'userAgent.startsWith("Lightpanda/")'
- 'userAgent.startsWith("masscan/")'
# Typo'd opera botnet
- 'userAgent.matches("^Opera/[0-9.]+\\.\\(")'
# AI bullshit stuff, they do not respect robots.txt even while they read it
# TikTok Bytedance AI training
- 'userAgent.contains("Bytedance") || userAgent.contains("Bytespider")'
# Meta AI training; The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.
- 'userAgent.contains("meta-externalagent/") || userAgent.contains("meta-externalfetcher/") || userAgent.contains("FacebookBot")'
# Anthropic AI training and usage
- 'userAgent.contains("ClaudeBot") || userAgent.contains("Claude-User")|| userAgent.contains("Claude-SearchBot")'
# Common Crawl AI crawlers
- 'userAgent.contains("CCBot")'
# ChatGPT AI crawlers https://platform.openai.com/docs/bots
- 'userAgent.contains("GPTBot") || userAgent.contains("OAI-SearchBot") || userAgent.contains("ChatGPT-User")'
# Other AI crawlers
- 'userAgent.contains("Amazonbot") || userAgent.contains("Google-Extended") || userAgent.contains("PanguBot") || userAgent.contains("AI2Bot") || userAgent.contains("Diffbot") || userAgent.contains("cohere-training-data-crawler") || userAgent.contains("Applebot-Extended")'
# SEO / Ads and marketing
- 'userAgent.contains("BLEXBot")'
action: drop
- name: unknown-crawlers
conditions:
# No user agent set
- 'userAgent == ""'
action: deny
# check a sequence of challenges for non logged in
- name: suspicious-crawlers
conditions: ['($is-suspicious-crawler)']
action: none
children:
- name: 0
action: check
settings:
challenges: [js-pow-sha256, http-cookie-check]
- name: 1
action: check
settings:
challenges: [self-preload-link, self-resource-load]
- name: 2
action: check
settings:
challenges: [self-header-refresh]
- name: always-pow-challenge
conditions:
# login and sign up paths
- 'path.startsWith("/user/sign_up")'
- 'path.startsWith("/user/login") || path.startsWith("/user/oauth2/")'
- 'path.startsWith("/user/activate")'
# repo / org / mirror creation paths
- 'path == "/repo/create" || path == "/repo/migrate" || path == "/org/create"'
# user profile info edit paths
- 'path == "/user/settings" || path.startsWith("/user/settings/hooks/")'
# issue creation
- 'path.matches("^/[^/]+/[^/]+/issues/new")'
# Match archive downloads from browsers and not tools
- 'path.matches("^/[^/]+/[^/]+/archive/.*\\.(bundle|zip|tar\\.gz)") && ($is-generic-browser)'
action: challenge
settings:
challenges: [ js-pow-sha256 ]
- name: allow-git-operations
conditions:
- '($is-git-path)'
- 'path.matches("^/[^/]+/[^/]+\\.git")'
- 'path.matches("^/[^/]+/[^/]+/") && ($is-git-ua)'
action: pass
- name: sitemap
conditions:
- 'path == "/sitemap.xml" || path.matches("^/explore/(users|repos)/sitemap-[0-9]+\\.xml$")'
action: pass
# TODO: rss
- name: api-call
conditions:
- 'path.startsWith("/api/v1/") || path.startsWith("/api/forgejo/v1/")'
- 'path.startsWith("/login/oauth/")'
- 'path.startsWith("/captcha/")'
- 'path.startsWith("/metrics/")'
# todo: post only
- 'path == "/-/markup"'
- 'path == "/user/events"'
- 'path == "/ssh_info"'
- 'path == "/api/healthz"'
# actions
- 'path.startsWith("/api/actions/") || path.startsWith("/api/actions_pipeline/")'
# user pubkeys
- 'path.matches("^/[^/]+\\.keys$")'
- 'path.matches("^/[^/]+\\.gpg")'
# OCI packages API and package managers
- 'path.startsWith("/api/packages/") || path == "/api/packages"'
- 'path.startsWith("/v2/") || path == "/v2"'
action: pass
- name: preview-fetchers
conditions:
# These summary cards are included in most previews at the end of the url
- 'path.endsWith("/-/summary-card")'
#- 'userAgent.contains("facebookexternalhit/")'
#- 'userAgent.contains("Twitterbot/")'
action: pass
# Allow loading and embedding of core pages without challenges
# Extended pages like linking to files or tabs are not covered here, but might be included in other challenges
- name: homesite
conditions:
# Match root of site
- 'path == "/"'
# Match root of any repository or user, or issue or pr
# generic /*/*/ match gave too many options for scrapers to trigger random endpoints
# this is a negative match of endpoints that Forgejo holds as reserved as users or orgs
# see https://codeberg.org/forgejo/forgejo/src/branch/forgejo/models/user/user.go#L582
- '(path.matches("^/[^/]+/[^/]+/?$") || path.matches("^/[^/]+/[^/]+/(issues|pulls)/[0-9]+$") || (path.matches("^/[^/]+/?$") && size(query) == 0)) && !path.matches("(?i)^/(api|metrics|v2|assets|attachments|avatar|avatars|repo-avatars|captcha|login|org|repo|user|admin|devtest|explore|issues|pulls|milestones|notifications|ghost)(/|$)")'
action: pass
- name: desired-crawlers
conditions:
- 'userAgent.contains("+https://kagi.com/bot") && remoteAddress.network("kagibot")'
- '(userAgent.contains("+http://www.google.com/bot.html") || userAgent.contains("Google-PageRenderer") || userAgent.contains("Google-InspectionTool") || userAgent.contains("Googlebot")) && remoteAddress.network("googlebot")'
- 'userAgent.contains("+http://www.bing.com/bingbot.htm") && remoteAddress.network("bingbot")'
- 'userAgent.contains("+http://duckduckgo.com/duckduckbot.html") && remoteAddress.network("duckduckbot")'
- 'userAgent.contains("+https://help.qwant.com/bot/") && remoteAddress.network("qwantbot")'
- 'userAgent.contains("+http://yandex.com/bots") && remoteAddress.network("yandexbot")'
action: pass
# check a sequence of challenges
- name: heavy-operations
conditions: ['($is-heavy-resource)']
action: none
children:
- name: 0
action: check
settings:
challenges: [self-preload-link, self-header-refresh, js-pow-sha256, http-cookie-check]
- name: 1
action: check
settings:
challenges: [ self-resource-load, js-pow-sha256, http-cookie-check ]
settings:
challenges: [self-preload-link, self-header-refresh, js-pow-sha256, http-cookie-check]
- name: standard-bots
action: check
settings:
challenges: [self-meta-refresh, self-resource-load]
conditions:
- '($is-generic-robot-ua)'
# Allow all source downloads not caught in browser above
# todo: limit this as needed?
- name: source-download
conditions:
- 'path.matches("^/[^/]+/[^/]+/raw/branch/")'
- 'path.matches("^/[^/]+/[^/]+/archive/")'
- 'path.matches("^/[^/]+/[^/]+/releases/download/")'
- 'path.matches("^/[^/]+/[^/]+/media/") && ($is-generic-browser)'
action: pass
# check DNSBL and serve harder challenges
# todo: make this specific to score
- name: undesired-dnsbl
action: check
settings:
challenges: [dnsbl]
# if DNSBL fails, check additional challenges
fail: check
fail-settings:
challenges: [js-pow-sha256, http-cookie-check]
- name: suspicious-fetchers
action: check
settings:
challenges: [js-pow-sha256]
conditions:
- 'userAgent.contains("facebookexternalhit/") || userAgent.contains("facebookcatalog/")'
# Allow PUT/DELETE/PATCH/POST requests in general
- name: non-get-request
action: pass
conditions:
- '!(method == "HEAD" || method == "GET")'
- name: plaintext-browser
action: challenge
settings:
challenges: [http-cookie-check, self-meta-refresh, self-cookie]
conditions:
- 'userAgent.startsWith("Lynx/")'
- name: standard-tools
action: challenge
settings:
challenges: [self-cookie]
conditions:
- '($is-tool-ua)'
- '!($is-generic-browser)'
- name: standard-browser
action: challenge
settings:
challenges: [http-cookie-check, self-preload-link, self-meta-refresh, self-resource-load, js-pow-sha256]
conditions:
- '($is-generic-browser)'