discourse-to-github-archiver/discourse2github.py
c0mmando c2f1fa9345
Refactor code, add features, fix bugs
- Removed duplicate post titles
- Fixed script termination
- Removed duplicates in readme
- Removed double image links
- Clean up post titles
- Organized readme topics by category
- Fix bug preventing archive of more than 20 posts per topic
2025-04-03 02:34:49 +00:00

#!/usr/bin/env python3
"""
Archive Discourse posts and render topics to Markdown from multiple sites.
Uses locally archived JSON posts to render Markdown topics. The API is used only
to check for and fetch new posts for a topic. The API endpoints used are:
- https://{defaultHost}/t/{topic_id}.json (for topic metadata)
- https://{defaultHost}/posts/{post_id}.json (for individual posts)
- https://{defaultHost}/c/{slug}/{id}.json (for listing topics by category)
Usage:
./discourse2github.py --urls https://forum.example.org,... --target-dir ./archive
"""
import argparse
import concurrent.futures
import datetime
import functools
import json
import logging
import os
import re
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass, field
from pathlib import Path
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import html2text # pip install html2text
from bs4 import BeautifulSoup # pip install beautifulsoup4
# Logging setup: use rich if available.
lvl = 'DEBUG' if os.environ.get('DEBUG') else 'INFO'
try:
    from rich.logging import RichHandler
    logging.basicConfig(level=lvl, datefmt="[%X]", handlers=[RichHandler()])
except ImportError:
    logging.basicConfig(level=lvl)
log = logging.getLogger('archive')
# Config constants
BATCH_SIZE = 100
SLEEP_SEC = 2
MAX_ITER = 1000
RETRY_MAX = 5 # Maximum retries on error
# Argument Parser
parser = argparse.ArgumentParser(description='Archive and render Discourse topics.')
parser.add_argument('--urls', help='Comma-separated Discourse URLs',
                    default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org'))
parser.add_argument('--debug', action='store_true', default=os.environ.get('DEBUG', False))
parser.add_argument('-t', '--target-dir', help='Base directory for archives',
                    default=Path(os.environ.get('TARGET_DIR', './archive')))
@functools.cache
def args():
    return parser.parse_args()
def parse_sites(urls: str) -> list:
    return [u.strip().rstrip('/') for u in urls.split(',') if u.strip()]
# API credentials (optional)
API_KEY = os.environ.get("DISCOURSE_API_KEY", "")
API_USER = os.environ.get("DISCOURSE_API_USERNAME", "")
def fetch_url(url: str, timeout=15) -> str:
    """
    Fetch a URL with a retry loop. Logs additional debug info.
    If a 404 error is encountered, immediately return None.
    For other errors, wait and retry until RETRY_MAX is reached.
    """
    backoff = 3
    attempts = 0
    req = urllib.request.Request(url)
    # Add API headers if available.
    if API_KEY and API_USER:
        req.add_header("Api-Key", API_KEY)
        req.add_header("Api-Username", API_USER)
    while attempts < RETRY_MAX:
        try:
            log.debug("Attempt %d: Fetching URL: %s", attempts + 1, url)
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read().decode()
                log.debug(
                    "Successfully fetched URL: %s | HTTP Status: %s | Response length: %d bytes",
                    url, resp.status, len(data)
                )
                return data
        except urllib.error.HTTPError as e:
            if e.code == 404:
                log.warning("Resource not found (404) for %s, skipping further retries", url)
                return None
            attempts += 1
            log.warning("HTTPError fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True)
            time.sleep(backoff)
            backoff *= 2
        except Exception as e:
            attempts += 1
            log.warning("Error fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True)
            time.sleep(backoff)
            backoff *= 2
    log.error("Failed fetching %s after %d attempts.", url, RETRY_MAX)
    return None
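# Retry arithmetic for fetch_url: backoff starts at 3 s and doubles after each failed
# attempt, so a URL that keeps failing waits 3 + 6 + 12 + 24 + 48 s (~93 s) across the
# RETRY_MAX = 5 attempts before giving up; a 404 short-circuits immediately with None.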
def fetch_json(url: str, timeout=15) -> dict:
    """
    Fetch JSON data from a URL.
    Logs the received raw data size and the parsed JSON keys where applicable.
    Returns None if the fetch failed or returned 404.
    """
    data = fetch_url(url, timeout)
    if data is None:
        log.debug("No data returned for URL: %s", url)
        return None
    log.debug("Fetched raw data from %s (length: %d bytes)", url, len(data))
    try:
        js = json.loads(data)
        if isinstance(js, dict):
            log.debug("JSON parsed from %s, keys: %s", url, list(js.keys()))
        else:
            log.debug("JSON parsed from %s is not a dict (type: %s)", url, type(js).__name__)
        return js
    except json.JSONDecodeError as e:
        log.error("JSON decode error for %s: %s", url, e, exc_info=True)
        return None
def truncate_fn(name: str, max_len=255) -> str:
    if len(name) <= max_len:
        return name
    p = Path(name)
    stem, suffix = p.stem, "".join(p.suffixes)
    allowed = max_len - len(suffix)
    return (stem[:allowed] if allowed > 0 else name[:max_len]) + suffix
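# Example: truncate_fn("x" * 300 + ".json") keeps the ".json" suffix and trims the stem
# to 250 characters, yielding a 255-character filename.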
# --- Helpers for images & HTML content ---
def fix_url(url: str) -> str:
    return "https:" + url if url.startswith("//") else url
def download_img(url: str, dest: Path, tid: int = None, timeout=15):
    if dest.exists():
        log.debug("Img exists for topic %s: %s", tid, dest)
        return
    attempts = 0
    backoff = 2
    while attempts < RETRY_MAX:
        try:
            log.info("Downloading img for topic %s: %s", tid, url)
            with urllib.request.urlopen(fix_url(url), timeout=timeout) as r:
                data = r.read()
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(data)
            log.info("Saved img for topic %s to %s", tid, dest)
            return
        except Exception as e:
            attempts += 1
            log.warning("Failed downloading img for topic %s from %s: %s (attempt %d/%d)", tid, url, e, attempts, RETRY_MAX)
            time.sleep(backoff)
            backoff *= 2
    log.error("Exceeded maximum retries downloading image %s for topic %s", url, tid)
def proc_srcset(srcset: str, tdir: Path, rel: str, tid: int) -> str:
    parts = [e.strip() for e in srcset.split(",")]
    out = []
    for e in parts:
        seg = e.split()
        if not seg:
            continue
        orig = seg[0]
        fixed = fix_url(orig)
        fname = os.path.basename(urlparse(fixed).path)
        if not fname:
            log.warning("Empty filename in srcset for topic %s: %s", tid, fixed)
            continue
        dest = tdir / fname
        download_img(fixed, dest, tid)
        full = os.path.join(rel, fname).replace(os.sep, '/')
        out.append(f"{full} {seg[1]}" if len(seg) > 1 else full)
    return ", ".join(out)
def is_img_link(url: str) -> bool:
    return os.path.basename(urlparse(url).path).lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp"))
def remove_img_anchor(soup):
    # Remove anchors that wrap images.
    for a in soup.find_all("a"):
        if a.find("img"):
            a.replace_with(*a.contents)
    return soup
def proc_html(html, tdir: Path, rel: str, tid: int) -> str:
    soup = BeautifulSoup(html, "html.parser")
    cnt = 0
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            src = fix_url(src)
            fname = os.path.basename(urlparse(src).path)
            if fname:
                dest = tdir / fname
                download_img(src, dest, tid)
                cnt += 1
                img["src"] = os.path.join(rel, fname).replace(os.sep, '/')
            else:
                log.warning("Empty filename in src for topic %s: %s", tid, src)
        if s := img.get("srcset"):
            img["srcset"] = proc_srcset(s, tdir, rel, tid)
    for a in soup.find_all("a"):
        href = a.get("href")
        if href:
            fixed = fix_url(href)
            if is_img_link(fixed):
                fname = os.path.basename(urlparse(fixed).path)
                if fname:
                    dest = tdir / fname
                    download_img(fixed, dest, tid)
                    cnt += 1
                    a["href"] = os.path.join(rel, fname).replace(os.sep, '/')
                    if a.string:
                        a.string.replace_with("")
                else:
                    log.warning("Empty filename in href for topic %s: %s", tid, fixed)
    remove_img_anchor(soup)
    log.debug("Processed %d images for topic %s", cnt, tid)
    return str(soup)
def slugify(s: str) -> str:
    s = re.sub(r'[^a-z0-9\s-]', '', s.strip().lower())
    return re.sub(r'[\s-]+', '-', s) or "untitled"
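# Example: slugify("  Hello, World!  ") -> "hello-world"; an all-punctuation title
# falls back to "untitled".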
# --- Data models ---
@dataclass(frozen=True)
class PostTopic:
    id: int
    slug: str
    title: str
    category_id: int
@dataclass
class Post:
    id: int
    slug: str
    raw: dict
    def created_at(self) -> datetime.datetime:
        return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))
    def updated_at(self) -> datetime.datetime:
        return datetime.datetime.fromisoformat(self.raw['updated_at'].replace("Z", "+00:00"))
    def save(self, d: Path) -> None:
        """Save the post JSON to disk (archive)."""
        idstr = str(self.id).zfill(10)
        fn = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json"
        fn = truncate_fn(fn)
        folder = self.created_at().strftime('%Y-%m-%B')
        path = d / folder / fn
        # Only write if changed.
        if path.exists():
            try:
                ex = json.loads(path.read_text(encoding='utf-8'))
                if ex.get("updated_at") == self.raw.get("updated_at"):
                    log.debug("Post %s unchanged; skip saving.", self.id)
                    return
            except Exception as e:
                log.debug("Error reading %s: %s", path, e)
        path.parent.mkdir(parents=True, exist_ok=True)
        log.info("Saving post %s to %s", self.id, path)
        path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8')
    @classmethod
    def from_json(cls, j: dict) -> 'Post':
        return cls(id=j['id'], slug=j.get('topic_slug', 'unknown'), raw=j)
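# Post.save() lays archived posts out as (values are illustrative):
#   <posts_dir>/2025-04-April/0000012345-alice-example-topic.json
# i.e. a YYYY-MM-MonthName folder from created_at, then a zero-padded post id,
# username and topic slug.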
@dataclass
class Topic:
    id: int
    slug: str
    title: str
    category_id: int
    created_at_str: str
    markdown: str = field(default="")  # initial markdown content
    def created_at(self) -> datetime.datetime:
        return datetime.datetime.fromisoformat(self.created_at_str.replace("Z", "+00:00"))
    def save_rendered(self, d: Path) -> Path:
        date_s = str(self.created_at().date())
        fn = f"{date_s}-{self.slug}-id{self.id}.md"
        fn = truncate_fn(fn)
        folder = self.created_at().strftime('%Y-%m-%B')
        path = d / folder / fn
        path.parent.mkdir(parents=True, exist_ok=True)
        log.info("Saving rendered topic %s to %s", self.id, path)
        path.write_text(self.markdown, encoding='utf-8')
        return path.relative_to(d.parent)
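# Topic.save_rendered() writes (values are illustrative):
#   <rendered-topics>/2025-04-April/2025-04-03-example-topic-id123.md
# and returns the path relative to the site directory (d.parent), which is what ends up
# in the README table of contents.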
# --- API fetching for topics and posts ---
def fetch_topic_meta(site: str, topic_id: int) -> dict:
    url = f"{site}/t/{topic_id}.json"
    result = fetch_json(url)
    if result is None:
        log.warning("Topic metadata not found for topic %s", topic_id)
    return result
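# The /t/{topic_id}.json payload is used below for the topic's slug, title, category_id,
# created_at, and its "post_stream" (the first batch of posts plus the full "stream" of
# post IDs).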
def fetch_single_post(site: str, post_id: int) -> dict:
    """
    Fetch a single post by post_id from the site.
    Logs detailed info upon a successful fetch.
    """
    url = f"{site}/posts/{post_id}.json"
    result = fetch_json(url)
    if result is None:
        log.warning("Post %s not found on site %s", post_id, site)
    else:
        # Log detailed post information if available
        username = result.get("username", "unknown")
        topic_slug = result.get("topic_slug", "unknown")
        created_at = result.get("created_at", "unknown time")
        log.debug("Fetched post %s: topic_slug='%s', username='%s', created_at='%s'",
                  post_id, topic_slug, username, created_at)
        # Optionally, you can also log the whole JSON response or its size:
        log.debug("Post %s JSON size: %d bytes", post_id, len(json.dumps(result)))
    return result
# --- Rendering functions using fresh API post data ---
def render_topic(site: str, topic_id: int, tops_dir: Path, cats: dict) -> dict:
    """
    Render each post individually and append it immediately to the topic markdown file.
    This version fetches EVERY post in the topic (using additional API calls if needed),
    not just the first 20.
    """
    topic_meta = fetch_topic_meta(site, topic_id)
    if not topic_meta:
        log.warning("No metadata found for topic %s; skipping render.", topic_id)
        return None
    # Use the topic meta from /t/{topic_id}.json
    slug = topic_meta.get("slug", "unknown")
    title = topic_meta.get("title", "No Title")
    category_id = int(topic_meta.get("category_id", 0))
    created_at_str = topic_meta.get("created_at", datetime.datetime.now().isoformat())
    # Create assets dir for images.
    assets = tops_dir.parent / "assets" / "images" / f"{topic_id}"
    assets.mkdir(parents=True, exist_ok=True)
    folder = datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).strftime('%Y-%m-%B')
    md_dir = tops_dir / folder
    rel_path = os.path.relpath(assets, md_dir)
    # Create or truncate the markdown topic file
    date_s = str(datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).date())
    fn = f"{date_s}-{slug}-id{topic_id}.md"
    fn = truncate_fn(fn)
    topic_md_path = md_dir / fn
    topic_md_path.parent.mkdir(parents=True, exist_ok=True)
    log.info("Creating markdown file for topic %s at %s", topic_id, topic_md_path)
    # Write the topic title as header
    with topic_md_path.open(mode="w", encoding="utf8") as f:
        f.write(f"# {title}\n\n")
    conv = html2text.HTML2Text()
    conv.body_width = 0
    # ---- Fetch ALL posts for the topic ----
    # Get posts from topic_meta (first 20 posts)
    posts_meta = topic_meta.get("post_stream", {}).get("posts", [])
    # Also get the full post stream (IDs) which might include extra post IDs
    full_stream = topic_meta.get("post_stream", {}).get("stream", [])
    # Identify extra post IDs that might not be in posts_meta
    # (Since posts_meta are typically the first 20 posts.)
    extra_ids = [pid for pid in full_stream if pid not in [p.get("id") for p in posts_meta]]
    log.debug("Topic %s: %d posts in initial load, %d extra IDs detected.", topic_id, len(posts_meta), len(extra_ids))
    # Fetch extras in chunks (say, 20 per request)
    n = 20
    if extra_ids:
        chunks = [extra_ids[i:i+n] for i in range(0, len(extra_ids), n)]
        for chunk in chunks:
            # Build query string with multiple post_ids[] parameters
            qs = "&".join([f"post_ids[]={pid}" for pid in chunk])
            posts_extra_url = f"{site}/t/{topic_id}/posts.json?{qs}"
            extra_response = fetch_json(posts_extra_url)
            if extra_response and "post_stream" in extra_response and "posts" in extra_response["post_stream"]:
                extra_posts = extra_response["post_stream"]["posts"]
                posts_meta.extend(extra_posts)
            else:
                log.warning("Failed fetching extra posts for topic %s with URL: %s", topic_id, posts_extra_url)
    # Sort posts by post_number if available (to preserve original order)
    posts_meta.sort(key=lambda p: p.get("post_number", 0))
    # ---- End fetch-all posts section ----
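    # The chunked requests above produce URLs of the form (IDs are illustrative):
    #   {site}/t/{topic_id}/posts.json?post_ids[]=21&post_ids[]=22&...
    # which is how more than the initial ~20 posts per topic get archived.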
    # Extract post IDs from the combined posts_meta
    post_ids = [post["id"] for post in posts_meta]
    log.debug("Processing a total of %d posts for topic %s", len(post_ids), topic_id)
    # Now process each post
    for post in posts_meta:
        try:
            post_id = post.get("id")
            log.debug("Processing post ID %s for topic %s", post_id, topic_id)
            # Create header for the post and fetch necessary dates
            cdt = datetime.datetime.fromisoformat(post.get("created_at").replace("Z", "+00:00"))
            udt = datetime.datetime.fromisoformat(post.get("updated_at", "").replace("Z", "+00:00")) if post.get("updated_at") else cdt
            hdr = (f"> **Post #{post.get('post_number', 0)} by {post.get('username', 'unknown')}**\n"
                   f"> Created: {cdt.strftime('%Y-%m-%d %H:%M')}\n"
                   f"> Updated: {udt.strftime('%Y-%m-%d %H:%M')}")
            cooked = post.get("cooked", "")
            proc = proc_html(cooked, assets, rel_path, topic_id)
            md_post = conv.handle(proc)
            # Clean up the markdown post: drop image-size residue such as "name 690×388 45.3 KB"
            clean_lines = []
            for l in md_post.splitlines():
                if re.search(r'\S+\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)$', l, flags=re.IGNORECASE):
                    continue
                clean_lines.append(l)
            md_post = "\n".join(clean_lines)
            md_post = re.sub(r'(\S+)\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)', r'\1', md_post, flags=re.IGNORECASE)
            section = f"<!-- ✦✦✦ POST START ✦✦✦ -->\n\n{hdr}\n\n{md_post}\n\n<!-- ✦✦✦ POST END ✦✦✦ -->\n\n"
            with topic_md_path.open(mode="a", encoding="utf8") as f:
                f.write(section)
            log.debug("Appended post #%s (ID %s) to topic markdown file", post.get("post_number", "?"), post_id)
            time.sleep(0.2)  # small pause between posts to keep API calls sequential
        except Exception as e:
            log.error("Error processing post %s: %s", post.get("id"), e)
    # After processing, read the file content and return the topic info.
    full_md = topic_md_path.read_text(encoding='utf8')
    topic_obj = Topic(
        id=topic_id,
        slug=slug,
        title=title,
        category_id=category_id,
        created_at_str=created_at_str,
        markdown=full_md,
    )
    rel_saved = topic_obj.save_rendered(tops_dir)  # This rewrites the file; that's acceptable.
    log.info("Rendered topic %s (%s) with %d posts", topic_obj.id, topic_obj.slug, len(post_ids))
    return {"id": topic_id, "title": title, "relative_path": str(rel_saved), "category": cats.get(category_id, "Uncategorized")}
# --- README update functions ---
TOC_PAT = re.compile(r"- \[(?P<title>.+?)\]\((?P<rel>.+?)\) <!-- id: (?P<id>\d+) -->")
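# TOC_PAT matches the TOC lines written by append_readme()/write_readme(), e.g. (path illustrative):
#   - [Some topic title](rendered-topics/2025-04-April/2025-04-03-some-topic-id123.md) <!-- id: 123 -->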
def read_readme(root: Path):
    rp = root / "README.md"
    topics = {}
    if rp.exists():
        try:
            for l in rp.read_text(encoding="utf-8").splitlines():
                m = TOC_PAT.match(l.strip())
                if m:
                    tid = int(m.group("id"))
                    topics[tid] = {"id": tid, "title": m.group("title"), "relative_path": m.group("rel")}
        except Exception as e:
            log.error("Failed parsing README.md: %s", e)
    return topics
def append_readme(root: Path, ntop: dict):
    rp = root / "README.md"
    header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
    line = f"- [{ntop['title']}]({ntop['relative_path']}) <!-- id: {ntop['id']} -->"
    if rp.exists():
        try:
            lines = rp.read_text(encoding="utf-8").splitlines()
            try:
                idx = lines.index("## Table of Contents") + 1
                while idx < len(lines) and TOC_PAT.match(lines[idx].strip()):
                    idx += 1
                lines.insert(idx, line)
                newc = "\n".join(lines)
            except ValueError:
                newc = "\n".join(header + [line] + [""] + lines)
        except Exception as e:
            log.error("Error reading README.md: %s", e)
            newc = "\n".join(header + [line])
    else:
        newc = "\n".join(header + [line])
    try:
        rp.write_text(newc, encoding="utf-8")
        log.info("Updated README.md at %s", rp)
    except Exception as e:
        log.error("Failed writing README.md: %s", e)
def write_readme(site_dir: Path, tops: dict):
    rp = site_dir / "README.md"
    lines = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
    group = {}
    for t in tops.values():
        group.setdefault(t.get("category", "Uncategorized"), []).append(t)
    for cat in sorted(group.keys()):
        lines.append(f"### {cat}")
        for t in sorted(group[cat], key=lambda x: x["id"]):
            lines.append(f"- [{t['title']}]({t['relative_path']}) <!-- id: {t['id']} -->")
        lines.append("")
    try:
        rp.write_text("\n".join(lines), encoding='utf-8')
        log.info("Finalized README.md at %s", rp)
    except Exception as e:
        log.error("Failed writing final README.md: %s", e)
def update_meta(meta_file: Path, meta: dict):
    log.debug("Updating meta: %s", meta)
    meta_file.write_text(json.dumps(meta, indent=2), encoding='utf-8')
# --- New function to fetch topic IDs using list topics endpoint ---
def fetch_topic_ids(site: str) -> list:
    """
    Fetch topic IDs from each category using the /c/{slug}/{id}.json endpoint.
    Returns a list of topic IDs.
    """
    topic_ids = set()
    # Get categories data
    cats_js = fetch_json(f"{site}/categories.json")
    if not cats_js:
        log.error("Failed to fetch categories from %s", site)
        return list(topic_ids)
    cats = cats_js.get("category_list", {}).get("categories", [])
    for cat in cats:
        cat_id = cat.get("id")
        cat_slug = cat.get("slug")
        if not cat_id or not cat_slug:
            continue
        url = f"{site}/c/{cat_slug}/{cat_id}.json"
        js = fetch_json(url)
        if not js:
            log.warning("Failed to fetch topics for category %s using %s", cat_id, url)
            continue
        topics = js.get("topic_list", {}).get("topics", [])
        for t in topics:
            tid = t.get("id")
            if tid:
                topic_ids.add(tid)
    log.info("Fetched %d topic IDs from %s", len(topic_ids), site)
    return list(topic_ids)
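# Note: each /c/{slug}/{id}.json request above fetches a single page of the category's
# topic list; Discourse paginates these listings, so very large categories may only
# contribute their most recent topics here.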
# --- Main processing of a site ---
def process_site(site: str, base: Path):
    parsed = urlparse(site)
    sname = parsed.hostname or site.replace("https://", "").replace("http://", "").split('/')[0]
    log.info("Processing site: %s", site)
    sdir = base / sname
    posts_d = sdir / 'posts'
    tops_d = sdir / 'rendered-topics'
    posts_d.mkdir(parents=True, exist_ok=True)
    tops_d.mkdir(parents=True, exist_ok=True)
    meta_file = sdir / '.metadata.json'
    meta = {"archived_topic_ids": {}, "topics": {}}
    if meta_file.exists():
        try:
            meta = json.loads(meta_file.read_text())
        except Exception as e:
            log.error("Failed reading meta for %s: %s", site, e)
    rendered_topics = meta.get("topics", {})
    topic_ids_to_process = fetch_topic_ids(site)
    log.debug("Topic IDs to process: %s", topic_ids_to_process)
    cats = fetch_cats(site)  # fetch the category id -> name mapping once for all topics
    rend_all = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_tid = {executor.submit(render_topic, site, tid, tops_d, cats): tid for tid in topic_ids_to_process}
        for future in as_completed(future_to_tid):
            tid = future_to_tid[future]
            try:
                rendered = future.result()
                if rendered:
                    rend_all[rendered["id"]] = rendered
                    meta.setdefault("topics", {})[str(rendered["id"])] = rendered
                    meta.setdefault("archived_topic_ids", {})[str(rendered["id"])] = {
                        "rendered_at": datetime.datetime.now().isoformat()
                    }
                    update_meta(meta_file, meta)
                    append_readme(sdir, rendered)
            except Exception as e:
                log.error("Error rendering topic %s: %s", tid, e)
    if rend_all:
        write_readme(sdir, rend_all)
    else:
        log.info("Site %s: No topics rendered; skipping final README.", site)
    update_meta(meta_file, meta)
def fetch_cats(site: str) -> dict:
    """Fetch topic categories using the /categories.json endpoint for now."""
    try:
        js = fetch_json(site + "/categories.json")
        cats = js.get("category_list", {}).get("categories", [])
        mapping = {int(c["id"]): c["name"] for c in cats}
        log.info("Fetched %d categories from %s", len(mapping), site)
        return mapping
    except Exception as e:
        log.error("Failed fetching categories from %s: %s", site, e)
        return {}
def main() -> None:
    params = args()
    base = params.target_dir if isinstance(params.target_dir, Path) else Path(params.target_dir)
    base.mkdir(parents=True, exist_ok=True)
    sites = parse_sites(params.urls)
    if not sites:
        log.error("No valid sites provided. Exiting.")
        sys.exit(1)
    for s in sites:
        process_site(s, base)
if __name__ == "__main__":
    main()