From c2f1fa934583825fa6f27b2186a465b9549f2059 Mon Sep 17 00:00:00 2001 From: c0mmando <103726157+c0mmando@users.noreply.github.com> Date: Thu, 3 Apr 2025 02:34:49 +0000 Subject: [PATCH] Refactor code, add features, fix bugs - Removed duplicate post titles - Fixed script termination - Removed duplicates in readme - Removed double image links - Clean up post titles - Organized readme topics by category - Fix bug preventing archive of more than 20 posts per topic --- discourse2github.py | 974 ++++++++++++++++++++++---------------------- 1 file changed, 487 insertions(+), 487 deletions(-) diff --git a/discourse2github.py b/discourse2github.py index 8b37072..8a9160c 100644 --- a/discourse2github.py +++ b/discourse2github.py @@ -2,19 +2,19 @@ """ Archive Discourse posts and render topics to Markdown from multiple sites. -This script downloads posts from one or more Discourse servers via their APIs. -It archives new posts as JSON files (skipping those already saved or archived), -renders topics to Markdown files for each batch of posts concurrently (with images -downloaded and link URLs rewritten as relative paths), updates a metadata file -after each post is indexed, and then updates a README.md with a table of contents -linking to each archived topic. +Uses locally archived JSON posts to render Markdown topics. The API is only used +to check/newly fetch posts for a topic. The API endpoints used are: + - https://{defaultHost}/t/{topic_id}.json (for topic metadata) + - https://{defaultHost}/posts/{post_id}.json (for individual posts) + - https://{defaultHost}/c/{slug}/{id}.json (for listing topics by category) Usage: - ./discourse2github.py --urls https://forum.hackliberty.org,https://forum.qubes-os.org --target-dir ./archive + ./discourse2github.py --urls https://forum.example.org,... --target-dir ./archive """ import argparse import concurrent.futures +import datetime import functools import json import logging @@ -23,594 +23,594 @@ import re import sys import time import urllib.request -import datetime -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from urllib.parse import urlparse +from concurrent.futures import ThreadPoolExecutor, as_completed + import html2text # pip install html2text from bs4 import BeautifulSoup # pip install beautifulsoup4 -# Set up logging. If the 'rich' module is available, it will be used. -loglevel = 'DEBUG' if os.environ.get('DEBUG') else 'INFO' +# Logging setup: use rich if available. 
+lvl = 'DEBUG' if os.environ.get('DEBUG') else 'INFO' try: from rich.logging import RichHandler - logging.basicConfig(level=loglevel, datefmt="[%X]", handlers=[RichHandler()]) + logging.basicConfig(level=lvl, datefmt="[%X]", handlers=[RichHandler()]) except ImportError: - logging.basicConfig(level=loglevel) + logging.basicConfig(level=lvl) log = logging.getLogger('archive') -# Argument parser -parser = argparse.ArgumentParser( - description='Archive topics from one or more Discourse installations and render to Markdown') -parser.add_argument( - '--urls', - help='Comma-separated URLs of Discourse servers (for example: "https://forum.hackliberty.org,https://forum.qubes-os.org")', - default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org')) -parser.add_argument( - '--debug', action='store_true', default=os.environ.get('DEBUG', False)) -parser.add_argument( - '-t', '--target-dir', help='Target base directory for the archives', - default=Path(os.environ.get('TARGET_DIR', './archive'))) +# Config constants +BATCH_SIZE = 100 +SLEEP_SEC = 2 +MAX_ITER = 1000 +RETRY_MAX = 5 # Maximum retries on error + +# Argument Parser +parser = argparse.ArgumentParser(description='Archive and render Discourse topics.') +parser.add_argument('--urls', help='Comma-separated Discourse URLs', + default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org')) +parser.add_argument('--debug', action='store_true', default=os.environ.get('DEBUG', False)) +parser.add_argument('-t', '--target-dir', help='Base directory for archives', + default=Path(os.environ.get('TARGET_DIR', './archive'))) @functools.cache def args(): return parser.parse_args() -def parse_sites(urls_string: str) -> list: - """Return a list of cleaned-up site URLs.""" - return [url.strip().rstrip('/') for url in urls_string.split(',') if url.strip()] +def parse_sites(urls: str) -> list: + return [u.strip().rstrip('/') for u in urls.split(',') if u.strip()] -def http_get(site_url: str, path: str, timeout: int = 15) -> str: - """Simple HTTP GET with exponential backoff and a timeout.""" - full_url = f"{site_url}{path}" - log.debug("HTTP GET %s", full_url) +# API credentials (optional) +API_KEY = os.environ.get("DISCOURSE_API_KEY", "") +API_USER = os.environ.get("DISCOURSE_API_USERNAME", "") + +def fetch_url(url: str, timeout=15) -> str: + """ + Fetch a URL with a retry loop. Logs additional debug info. + If a 404 error is encountered, immediately return None. + For other errors, wait and retry until RETRY_MAX is reached. + """ backoff = 3 - while True: + attempts = 0 + req = urllib.request.Request(url) + # Add API headers if available. 
+ if API_KEY and API_USER: + req.add_header("Api-Key", API_KEY) + req.add_header("Api-Username", API_USER) + while attempts < RETRY_MAX: try: - with urllib.request.urlopen(full_url, timeout=timeout) as response: - return response.read().decode() - except Exception as e: - log.debug("Error fetching %s: %s -- Retrying in %d seconds", full_url, e, backoff) + log.debug("Attempt %d: Fetching URL: %s", attempts + 1, url) + with urllib.request.urlopen(req, timeout=timeout) as resp: + data = resp.read().decode() + log.debug( + "Successfully fetched URL: %s | HTTP Status: %s | Response length: %d bytes", + url, resp.status, len(data) + ) + return data + except urllib.error.HTTPError as e: + if e.code == 404: + log.warning("Resource not found (404) for %s, skipping further retries", url) + return None + attempts += 1 + log.warning("HTTPError fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True) time.sleep(backoff) backoff *= 2 - if backoff >= 256: - log.exception("Rate limit or unrecoverable error for %s", full_url) - sys.exit(1) + except Exception as e: + attempts += 1 + log.warning("Error fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True) + time.sleep(backoff) + backoff *= 2 + log.error("Failed fetching %s after %d attempts.", url, RETRY_MAX) + return None -def http_get_json(site_url: str, path: str, timeout: int = 15) -> dict: - """Fetch URL contents from a specific site and decode JSON.""" +def fetch_json(url: str, timeout=15) -> dict: + """ + Fetch JSON data from a URL. + Logs the received raw data size and the parsed JSON keys where applicable. + Returns None if the fetch failed or returned 404. + """ + data = fetch_url(url, timeout) + if data is None: + log.debug("No data returned for URL: %s", url) + return None + log.debug("Fetched raw data from %s (length: %d bytes)", url, len(data)) try: - return json.loads(http_get(site_url, path, timeout=timeout)) - except json.JSONDecodeError: - log.warning("Unable to decode JSON response from %r", path) - raise - -# ----- Helper: Truncate Filename ----- -def truncate_filename(filename: str, max_length: int = 255) -> str: - """ - Truncates the file name to a maximum length (default 255 characters). - It preserves the file extension. - """ - if len(filename) <= max_length: - return filename - p = Path(filename) - stem = p.stem - suffix = "".join(p.suffixes) - max_stem_length = max_length - len(suffix) - if max_stem_length <= 0: - return filename[:max_length] - truncated_stem = stem[:max_stem_length] - return truncated_stem + suffix - -# ----- Image / Link Download Helpers ----- -def fix_url(original_url: str) -> str: - """Fix scheme-relative URLs by prepending https: if necessary.""" - if original_url.startswith("//"): - fixed = "https:" + original_url - log.debug("Converted scheme-relative URL: %s -> %s", original_url, fixed) - return fixed - return original_url - -def download_image(image_url: str, dest_path: Path, timeout: int = 15): - """ - Download an image from image_url and save it to dest_path. - If the file already exists, skip downloading. - A timeout is specified to avoid hanging indefinitely. 
- """ - if dest_path.exists(): - log.debug("Image already downloaded: %s", dest_path) - return - try: - log.info("Downloading image: %s", image_url) - with urllib.request.urlopen(fix_url(image_url), timeout=timeout) as response: - image_data = response.read() - dest_path.parent.mkdir(parents=True, exist_ok=True) - dest_path.write_bytes(image_data) - log.info("Saved image to %s", dest_path) - except Exception as e: - log.error("Failed to download image %s: %s", image_url, e) - -def process_srcset(srcset_value: str, topic_dir: Path, topic_relative_path: str) -> str: - """ - Process a srcset attribute value, downloading images and returning a rewritten value. - Downloads every image referenced regardless of URL content. - """ - entries = srcset_value.split(",") - fixed_entries = [] - for entry in entries: - parts = entry.strip().split() - if not parts: - continue - orig_url = parts[0] - fixed_url = fix_url(orig_url) - parsed = urlparse(fixed_url) - image_filename = os.path.basename(parsed.path) - if not image_filename: - log.warning("Skipping srcset URL with empty filename: %s", fixed_url) - continue - dest_path = topic_dir / image_filename - download_image(fixed_url, dest_path) - full_path = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/') - if len(parts) > 1: - fixed_entries.append(f"{full_path} {parts[1]}") + js = json.loads(data) + if isinstance(js, dict): + log.debug("JSON parsed from %s, keys: %s", url, list(js.keys())) else: - fixed_entries.append(f"{full_path}") - return ", ".join(fixed_entries) + log.debug("JSON parsed from %s is not a dict (type: %s)", url, type(js).__name__) + return js + except json.JSONDecodeError as e: + log.error("JSON decode error for %s: %s", url, e, exc_info=True) + return None -def is_image_link(url: str) -> bool: - """Determine if the URL points to an image by its extension.""" - image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".webp") - parsed = urlparse(url) - filename = os.path.basename(parsed.path).lower() - return filename.endswith(image_extensions) -def process_html(html_content: str, topic_dir: Path, topic_relative_path: str) -> str: - """ - Process the given HTML: download referenced images and rewrite links. - Processes both (src, srcset) and tags pointing to images. - Downloads every image referenced in the HTML. - Returns the modified HTML. - """ - soup = BeautifulSoup(html_content, "html.parser") +def truncate_fn(name: str, max_len=255) -> str: + if len(name) <= max_len: + return name + p = Path(name) + stem, suffix = p.stem, "".join(p.suffixes) + allowed = max_len - len(suffix) + return (stem[:allowed] if allowed > 0 else name[:max_len]) + suffix - # Process tags. 
+# --- Helpers for images & HTML content --- +def fix_url(url: str) -> str: + return "https:" + url if url.startswith("//") else url + +def download_img(url: str, dest: Path, tid: int = None, timeout=15): + if dest.exists(): + log.debug("Img exists for topic %s: %s", tid, dest) + return + attempts = 0 + backoff = 2 + while attempts < RETRY_MAX: + try: + log.info("Downloading img for topic %s: %s", tid, url) + with urllib.request.urlopen(fix_url(url), timeout=timeout) as r: + data = r.read() + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(data) + log.info("Saved img for topic %s to %s", tid, dest) + return + except Exception as e: + attempts += 1 + log.warning("Failed downloading img for topic %s from %s: %s (attempt %d/%d)", tid, url, e, attempts, RETRY_MAX) + time.sleep(backoff) + backoff *= 2 + log.error("Exceeded maximum retries downloading image %s for topic %s", url, tid) + +def proc_srcset(srcset: str, tdir: Path, rel: str, tid: int) -> str: + parts = [e.strip() for e in srcset.split(",")] + out = [] + for e in parts: + seg = e.split() + if not seg: + continue + orig = seg[0] + fixed = fix_url(orig) + fname = os.path.basename(urlparse(fixed).path) + if not fname: + log.warning("Empty filename in srcset for topic %s: %s", tid, fixed) + continue + dest = tdir / fname + download_img(fixed, dest, tid) + full = os.path.join(rel, fname).replace(os.sep, '/') + out.append(f"{full} {seg[1]}" if len(seg) > 1 else full) + return ", ".join(out) + +def is_img_link(url: str) -> bool: + return os.path.basename(urlparse(url).path).lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")) + +def remove_img_anchor(soup): + # Remove anchors that wrap images. + for a in soup.find_all("a"): + if a.find("img"): + a.replace_with(*a.contents) + return soup + +def proc_html(html, tdir: Path, rel: str, tid: int) -> str: + soup = BeautifulSoup(html, "html.parser") + cnt = 0 for img in soup.find_all("img"): src = img.get("src") if src: src = fix_url(src) - parsed = urlparse(src) - image_filename = os.path.basename(parsed.path) - if image_filename: - dest_path = topic_dir / image_filename - download_image(src, dest_path) - full_src = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/') - img["src"] = full_src + fname = os.path.basename(urlparse(src).path) + if fname: + dest = tdir / fname + download_img(src, dest, tid) + cnt += 1 + img["src"] = os.path.join(rel, fname).replace(os.sep, '/') else: - log.warning("Skipping image with empty filename from src: %s", src) - srcset = img.get("srcset") - if srcset: - new_srcset = process_srcset(srcset, topic_dir, topic_relative_path) - img["srcset"] = new_srcset - - # Process tags whose href points to images. 
+ log.warning("Empty filename in src for topic %s: %s", tid, src) + if s := img.get("srcset"): + img["srcset"] = proc_srcset(s, tdir, rel, tid) for a in soup.find_all("a"): href = a.get("href") if href: - fixed_href = fix_url(href) - if is_image_link(fixed_href): - parsed = urlparse(fixed_href) - image_filename = os.path.basename(parsed.path) - if image_filename: - dest_path = topic_dir / image_filename - download_image(fixed_href, dest_path) - new_href = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/') - a["href"] = new_href + fixed = fix_url(href) + if is_img_link(fixed): + fname = os.path.basename(urlparse(fixed).path) + if fname: + dest = tdir / fname + download_img(fixed, dest, tid) + cnt += 1 + a["href"] = os.path.join(rel, fname).replace(os.sep, '/') + if a.string: + a.string.replace_with("") else: - log.warning("Skipping link with empty filename from href: %s", fixed_href) + log.warning("Empty filename in href for topic %s: %s", tid, fixed) + remove_img_anchor(soup) + log.debug("Processed %d images for topic %s", cnt, tid) return str(soup) -def slugify(value: str) -> str: - """ - Normalizes string, removes non-alphanumeric characters, and converts whitespace to hyphens. - Useful for constructing filenames. - """ - value = str(value) - value = value.strip().lower() - value = re.sub(r'[^a-z0-9\s-]', '', value) - value = re.sub(r'[\s-]+', '-', value) - return value or "untitled" +def slugify(s: str) -> str: + s = re.sub(r'[^a-z0-9\s-]', '', s.strip().lower()) + return re.sub(r'[\s-]+', '-', s) or "untitled" -# ----- Data Models ----- +# --- Data models --- @dataclass(frozen=True) class PostTopic: id: int slug: str title: str + category_id: int -@dataclass(frozen=True) +@dataclass class Post: id: int slug: str raw: dict - def get_created_at(self) -> datetime.datetime: + def created_at(self) -> datetime.datetime: return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00")) - def save(self, dir: Path): - """Save the raw JSON post to disk if not already archived.""" + def updated_at(self) -> datetime.datetime: + return datetime.datetime.fromisoformat(self.raw['updated_at'].replace("Z", "+00:00")) + + def save(self, d: Path) -> None: + """Save the post JSON to disk (archive).""" idstr = str(self.id).zfill(10) - filename = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json" - filename = truncate_filename(filename) - folder_name = self.get_created_at().strftime('%Y-%m-%B') - full_path = dir / folder_name / filename - - if full_path.exists(): - log.debug("Post %s already saved, skipping", self.id) - return - - full_path.parent.mkdir(parents=True, exist_ok=True) - log.info("Saving post %s to %s", self.id, full_path) - full_path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8') - - def get_topic(self) -> PostTopic: - return PostTopic( - id=self.raw.get('topic_id', self.id), - slug=self.raw.get('topic_slug', self.slug), - title=self.raw.get('topic_title', self.raw.get('title', 'No Title')), - ) + fn = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json" + fn = truncate_fn(fn) + folder = self.created_at().strftime('%Y-%m-%B') + path = d / folder / fn + # Only write if changed. 
+ if path.exists(): + try: + ex = json.loads(path.read_text(encoding='utf-8')) + if ex.get("updated_at") == self.raw.get("updated_at"): + log.debug("Post %s unchanged; skip saving.", self.id) + return + except Exception as e: + log.debug("Error reading %s: %s", path, e) + path.parent.mkdir(parents=True, exist_ok=True) + log.info("Saving post %s to %s", self.id, path) + path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8') @classmethod def from_json(cls, j: dict) -> 'Post': - return cls( - id=j['id'], - slug=j.get('topic_slug', 'unknown'), - raw=j, - ) + return cls(id=j['id'], slug=j.get('topic_slug', 'unknown'), raw=j) -@dataclass(frozen=True) +@dataclass class Topic: id: int slug: str - raw: dict - markdown: str + title: str + category_id: int + created_at_str: str + markdown: str = field(default="") # initial markdown content - def get_created_at(self) -> datetime.datetime: - return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00")) + def created_at(self) -> datetime.datetime: + return datetime.datetime.fromisoformat(self.created_at_str.replace("Z", "+00:00")) - def save_rendered(self, dir: Path): - """ - Save the rendered Markdown topic to disk. - Filename built from creation date, slug, and id. - Truncate the filename if needed. - """ - date_str = str(self.get_created_at().date()) - filename = f"{date_str}-{self.slug}-id{self.id}.md" - filename = truncate_filename(filename) - folder_name = self.get_created_at().strftime('%Y-%m-%B') - full_path = dir / folder_name / filename - full_path.parent.mkdir(parents=True, exist_ok=True) - log.info("Saving rendered topic %s to %s", self.id, full_path) - rendered_markdown = f"# {self.raw.get('title', 'No Title')}\n\n{self.markdown}" - full_path.write_text(rendered_markdown, encoding='utf-8') - # Return the relative path from the repository root. - return full_path.relative_to(dir.parent) + def save_rendered(self, d: Path) -> Path: + date_s = str(self.created_at().date()) + fn = f"{date_s}-{self.slug}-id{self.id}.md" + fn = truncate_fn(fn) + folder = self.created_at().strftime('%Y-%m-%B') + path = d / folder / fn + path.parent.mkdir(parents=True, exist_ok=True) + log.info("Saving rendered topic %s to %s", self.id, path) + path.write_text(self.markdown, encoding='utf-8') + return path.relative_to(d.parent) - @classmethod - def from_json(cls, t: dict, markdown: str) -> 'Topic': - slug = t.get('slug') or t.get('topic_slug') or "unknown" - return cls( - id=t.get('id', 0), - slug=slug, - raw=t, - markdown=markdown, - ) +# --- API fetching for topics and posts --- +def fetch_topic_meta(site: str, topic_id: int) -> dict: + url = f"{site}/t/{topic_id}.json" + result = fetch_json(url) + if result is None: + log.warning("Topic metadata not found for topic %s", topic_id) + return result -# ----- New Helper for Rendering Topics with Image Downloading ----- -def render_topic(site_url: str, topic: PostTopic, topics_dir: Path): +def fetch_single_post(site: str, post_id: int) -> dict: """ - Render a single topic to Markdown by: - 1. Fetching the topic JSON. - 2. Downloading its associated images and rewriting their URLs. - 3. Converting processed HTML to Markdown (using html2text). - 4. Saving the rendered Markdown document. - - Images are saved to an assets directory relative to the site target directory. - Returns a dictionary with topic info for README updating. + Fetch a single post by post_id from the site. + Logs detailed info upon a successful fetch. 
""" - try: - log.info("Fetching topic %s JSON from %s", topic.id, site_url) - topic_data = http_get_json(site_url, f"/t/{topic.id}.json") - except Exception as e: - log.warning("Failed to fetch topic JSON for topic %s: %s", topic.id, e) + url = f"{site}/posts/{post_id}.json" + result = fetch_json(url) + if result is None: + log.warning("Post %s not found on site %s", post_id, site) + else: + # Log detailed post information if available + username = result.get("username", "unknown") + topic_slug = result.get("topic_slug", "unknown") + created_at = result.get("created_at", "unknown time") + log.debug("Fetched post %s: topic_slug='%s', username='%s', created_at='%s'", + post_id, topic_slug, username, created_at) + # Optionally, you can also log the whole JSON response or its size: + log.debug("Post %s JSON size: %d bytes", post_id, len(json.dumps(result))) + return result + +# --- Rendering functions using fresh API post data --- +def render_topic(site: str, topic_id: int, tops_dir: Path, cats: dict) -> dict: + """ + Render each post individually and append it immediately to the topic markdown file. + This version fetches EVERY post in the topic (using additional API calls if needed), + not just the first 20. + """ + topic_meta = fetch_topic_meta(site, topic_id) + if not topic_meta: + log.warning("No metadata found for topic %s; skipping render.", topic_id) return None - # Define the assets directory in the repository root. - assets_dir = topics_dir.parent / "assets" / "images" / f"{topic.id}" - assets_dir.mkdir(parents=True, exist_ok=True) + # Use the topic meta from /t/{topic_id}.json + slug = topic_meta.get("slug", "unknown") + title = topic_meta.get("title", "No Title") + category_id = int(topic_meta.get("category_id", 0)) + created_at_str = topic_meta.get("created_at", datetime.datetime.now().isoformat()) - # Determine the directory where the rendered markdown file will be saved. - try: - created_at = datetime.datetime.fromisoformat(topic_data['created_at'].replace("Z", "+00:00")) - except Exception as e: - log.error("Could not parse created_at for topic %s: %s", topic.id, e) - created_at = datetime.datetime.now() - folder_name = created_at.strftime('%Y-%m-%B') - rendered_md_dir = topics_dir / folder_name + # Create assets dir for images. + assets = tops_dir.parent / "assets" / "images" / f"{topic_id}" + assets.mkdir(parents=True, exist_ok=True) + folder = datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).strftime('%Y-%m-%B') + md_dir = tops_dir / folder + rel_path = os.path.relpath(assets, md_dir) - # Compute the relative path from the markdown file's directory to the assets directory. 
- topic_relative_path = os.path.relpath(assets_dir, rendered_md_dir) + # Create or truncate the markdown topic file + date_s = str(datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).date()) + fn = f"{date_s}-{slug}-id{topic_id}.md" + fn = truncate_fn(fn) + topic_md_path = md_dir / fn + topic_md_path.parent.mkdir(parents=True, exist_ok=True) + log.info("Creating markdown file for topic %s at %s", topic_id, topic_md_path) + # Write the topic title as header + with topic_md_path.open(mode="w", encoding="utf8") as f: + f.write(f"# {title}\n\n") - posts = topic_data.get("post_stream", {}).get("posts", []) - if not posts: - log.error("No posts found for topic %s", topic.id) - return None + conv = html2text.HTML2Text() + conv.body_width = 0 - converter = html2text.HTML2Text() - converter.body_width = 0 - md_sections = [] - for post in posts: - created = post.get("created_at", "unknown") - updated = post.get("updated_at", "unknown") - post_number = post.get("post_number", 0) - cooked_html = post.get("cooked", "") - # Pass the corrected topic_relative_path into process_html() - processed_html = process_html(cooked_html, assets_dir, topic_relative_path) - post_md = converter.handle(processed_html) - header_lines = [ - f"**ID:** {topic.id}", - f"**USERNAME:** {post.get('username', 'unknown')}", - f"**POST NUMBER:** {post_number}", - f"**CREATED AT:** {created}", - f"**UPDATED AT:** {updated}", - ] - # Join header lines with two newlines so each appears on its own line in GitHub Markdown. - header = "\n\n".join(header_lines) - section = f"## Post {post_number}\n\n{header}\n\n---\n\n{post_md}" - md_sections.append(section) - full_md = "\n\n".join(md_sections) - topic_title = topic_data.get("title", "No Title") - full_md = f"# {topic_title}\n\n" + full_md + # ---- Modified section: Fetch ALL posts for the topic ---- + # Get posts from topic_meta (first 20 posts) + posts_meta = topic_meta.get("post_stream", {}).get("posts", []) + # Also get the full post stream (IDs) which might include extra post IDs + full_stream = topic_meta.get("post_stream", {}).get("stream", []) + # Identify extra post IDs that might not be in posts_meta + # (Since posts_meta are typically the first 20 posts.) + extra_ids = [pid for pid in full_stream if pid not in [p.get("id") for p in posts_meta]] + log.debug("Topic %s: %d posts in initial load, %d extra IDs detected.", topic_id, len(posts_meta), len(extra_ids)) - topic_obj = Topic.from_json(topic_data, full_md) - saved_relative_path = topic_obj.save_rendered(topics_dir) - log.info("Saved rendered topic %s (%s)", topic_obj.id, topic_obj.slug) - # Return topic info for README. 
-    return {
-        "id": topic_obj.id,
-        "slug": topic_obj.slug,
-        "title": topic_title,
-        "relative_path": str(saved_relative_path)
-    }
+    # Fetch extras in chunks (say, 20 per request)
+    n = 20
+    if extra_ids:
+        chunks = [extra_ids[i:i+n] for i in range(0, len(extra_ids), n)]
+        for chunk in chunks:
+            # Build query string with multiple post_ids[] parameters
+            qs = "&".join([f"post_ids[]={pid}" for pid in chunk])
+            posts_extra_url = f"{site}/t/{topic_id}/posts.json?{qs}"
+            extra_response = fetch_json(posts_extra_url)
+            if extra_response and "post_stream" in extra_response and "posts" in extra_response["post_stream"]:
+                extra_posts = extra_response["post_stream"]["posts"]
+                posts_meta.extend(extra_posts)
+            else:
+                log.warning("Failed fetching extra posts for topic %s with URL: %s", topic_id, posts_extra_url)

-# ----- Concurrent Rendering Helper -----
-def render_topics_concurrently(site_url: str, topics: dict, topics_dir: Path, max_workers: int = 8):
-    """
-    Render multiple topics concurrently.
-    Returns a list of rendered topic information dictionaries.
-    """
-    rendered_topics_info = []
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(render_topic, site_url, topic, topics_dir): topic for topic in topics.values()}
-        for future in concurrent.futures.as_completed(futures):
-            try:
-                result = future.result()
-                if result:
-                    rendered_topics_info.append(result)
-                    # Update the README incrementally after each topic is rendered.
-                    update_readme_incrementally(topics_dir.parent, result)
-            except Exception as exc:
-                log.error("A topic generated an exception: %s", exc)
-    return rendered_topics_info
+    # Sort posts by (for example) their post_number if available (to preserve original order)
+    posts_meta.sort(key=lambda p: p.get("post_number", 0))
+    # ---- End fetch-all posts section ----

-def update_metadata(metadata_file: Path, metadata: dict):
-    """Writes the metadata as a JSON file to disk."""
-    log.debug("Updating metadata: %s", metadata)
-    metadata_file.write_text(json.dumps(metadata, indent=2), encoding='utf-8')
+    # Extract post IDs from the combined posts_meta
+    post_ids = [post["id"] for post in posts_meta]
+    log.debug("Processing a total of %d posts for topic %s", len(post_ids), topic_id)

-# A helper pattern to match a TOC line (i.e. a line with the topic entry and its id)
-TOC_LINE_PATTERN = re.compile(
-    r"- \[(?P<title>.+?)\]\((?P<relative_path>.+?)\)\s*<!--\s*id:\s*(?P<id>\d+)\s*-->")
-
-# ----- README Update Helpers -----
-def read_existing_readme(repo_root: Path):
-    """
-    Read the existing README.md from repo_root and return a dictionary of topics.
-    The keys will be the topic IDs (as integers) and the values as the topic dict.
-    If the file doesn't exist, return an empty dict.
- """ - readme_path = repo_root / "README.md" - existing_topics = {} - if readme_path.exists(): + # Now process each post (as before) + for post in posts_meta: try: - content = readme_path.read_text(encoding='utf-8') - # Expecting lines like: - [Topic Title](relative_path) <!-- id: topic_id --> - pattern = TOC_LINE_PATTERN - for line in content.splitlines(): - match = pattern.match(line) - if match: - topic_id = int(match.group("id")) - existing_topics[topic_id] = { - "id": topic_id, - "title": match.group("title"), - "relative_path": match.group("relative_path") - } + post_id = post.get("id") + log.debug("Processing post ID %s for topic %s", post_id, topic_id) + # Create header for the post and fetch necessary dates + cdt = datetime.datetime.fromisoformat(post.get("created_at").replace("Z", "+00:00")) + udt = datetime.datetime.fromisoformat(post.get("updated_at", "").replace("Z", "+00:00")) if post.get("updated_at") else cdt + hdr = (f"> **Post #{post.get('post_number', 0)} • {post.get('username', 'unknown')}**\n" + f"> Created: {cdt.strftime('%Y-%m-%d %H:%M')}\n" + f"> Updated: {udt.strftime('%Y-%m-%d %H:%M')}") + cooked = post.get("cooked", "") + proc = proc_html(cooked, assets, rel_path, topic_id) + md_post = conv.handle(proc) + + # Clean up the markdown post + clean_lines = [] + for l in md_post.splitlines(): + if re.search(r'\S+\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)$', l, flags=re.IGNORECASE): + continue + clean_lines.append(l) + md_post = "\n".join(clean_lines) + md_post = re.sub(r'(\S+)\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)', r'\1', md_post, flags=re.IGNORECASE) + + section = f"<!-- ✦✦✦ POST START ✦✦✦ -->\n\n{hdr}\n\n{md_post}\n\n<!-- ✦✦✦ POST END ✦✦✦ -->\n\n" + with topic_md_path.open(mode="a", encoding="utf8") as f: + f.write(section) + log.debug("Appended post #%s (ID %s) to topic markdown file", post.get("post_number", "?"), post_id) + time.sleep(0.2) # to ensure sequential API calls (if needed) except Exception as e: - log.error("Failed to parse existing README.md: %s", e) - return existing_topics + log.error("Error processing post %s: %s", post.get("id"), e) -def update_readme_incrementally(repo_root: Path, new_topic: dict): - """ - Update or create README.md in repo_root by merging the new topic into the existing list. - If the topic already exists, report that. Otherwise, append the new topic to the TOC. - """ - topic_id = new_topic["id"] - existing_topics = read_existing_readme(repo_root) - if topic_id in existing_topics: - log.debug("Topic with id %s already exists in README.md", topic_id) - return - existing_topics[topic_id] = new_topic - append_to_readme(repo_root, new_topic) + # After processing, read the file content and return the topic info. + full_md = topic_md_path.read_text(encoding='utf8') + topic_obj = Topic( + id=topic_id, + slug=slug, + title=title, + category_id=category_id, + created_at_str=created_at_str, + markdown=full_md, + ) + rel_saved = topic_obj.save_rendered(tops_dir) # This rewrites the file; that's acceptable. + log.info("Rendered topic %s (%s) with %d posts", topic_obj.id, topic_obj.slug, len(post_ids)) + return {"id": topic_id, "title": title, "relative_path": str(rel_saved), "category": cats.get(category_id, "Uncategorized")} -def append_to_readme(repo_root: Path, new_topic: dict): - """ - Append a new topic to the existing README.md table-of-contents (TOC). - If README.md doesn't exist, create it with a header and the new topic. 
-    """
-    readme_path = repo_root / "README.md"
-    toc_header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
-    new_toc_line = f"- [{new_topic['title']}]({new_topic['relative_path']}) <!-- id: {new_topic['id']} -->"
-    if readme_path.exists():
+# --- README update functions ---
+TOC_PAT = re.compile(r"- \[(?P<title>.+?)\]\((?P<rel>.+?)\) <!-- id: (?P<id>\d+) -->")
+def read_readme(root: Path):
+    rp = root / "README.md"
+    topics = {}
+    if rp.exists():
         try:
-            # Read the existing content
-            content = readme_path.read_text(encoding="utf-8")
-            lines = content.splitlines()
-            # Check if the file already has a TOC header by looking for the header marker.
+            for l in rp.read_text(encoding="utf-8").splitlines():
+                m = TOC_PAT.match(l.strip())
+                if m:
+                    tid = int(m.group("id"))
+                    topics[tid] = {"id": tid, "title": m.group("title"), "relative_path": m.group("rel")}
+        except Exception as e:
+            log.error("Failed parsing README.md: %s", e)
+    return topics
+
+def append_readme(root: Path, ntop: dict):
+    rp = root / "README.md"
+    header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
+    line = f"- [{ntop['title']}]({ntop['relative_path']}) <!-- id: {ntop['id']} -->"
+    if rp.exists():
+        try:
+            lines = rp.read_text(encoding="utf-8").splitlines()
             try:
-                toc_start = lines.index("## Table of Contents")
-                # Find the blank line after the TOC header if exists
-                insertion_index = toc_start + 1
-                # Advance until we find the first non-TOC line or reach the end.
-                while (
-                    insertion_index < len(lines)
-                    and TOC_LINE_PATTERN.match(lines[insertion_index].strip())
-                ):
-                    insertion_index += 1
-                # Now, insert our new entry just before the first non-TOC line.
-                lines.insert(insertion_index, new_toc_line)
-                new_content = "\n".join(lines)
+                idx = lines.index("## Table of Contents") + 1
+                while idx < len(lines) and TOC_PAT.match(lines[idx].strip()):
+                    idx += 1
+                lines.insert(idx, line)
+                newc = "\n".join(lines)
             except ValueError:
-                # "## Table of Contents" not found, so we create a new TOC block at the top
-                new_content = "\n".join(toc_header + [new_toc_line] + [""] + lines)
+                newc = "\n".join(header + [line] + [""] + lines)
         except Exception as e:
-            log.error("Failed to read existing README.md: %s", e)
-            # In case of error, default to creating a new README.md with header and new topic
-            new_content = "\n".join(toc_header + [new_toc_line])
+            log.error("Error reading README.md: %s", e)
+            newc = "\n".join(header + [line])
     else:
-        # README.md doesn't exist, create a new one with a standard header and the new TOC entry
-        new_content = "\n".join(toc_header + [new_toc_line])
-
+        newc = "\n".join(header + [line])
     try:
-        readme_path.write_text(new_content, encoding="utf-8")
-        log.info("Updated README.md at %s", readme_path)
+        rp.write_text(newc, encoding="utf-8")
+        log.info("Updated README.md at %s", rp)
     except Exception as e:
-        log.error("Failed to write README.md: %s", e)
+        log.error("Failed writing README.md: %s", e)

-def write_readme(site_target_dir: Path, topics: dict):
-    """
-    Given a dictionary of topics, write out the full README.md at the site target directory.
- """ - readme_path = site_target_dir / "README.md" +def write_readme(site_dir: Path, tops: dict): + rp = site_dir / "README.md" lines = ["# Archived Discourse Topics", "", "## Table of Contents", ""] - sorted_topics = sorted(topics.values(), key=lambda t: t["id"]) - for topic in sorted_topics: - line = f"- [{topic['title']}]({topic['relative_path']}) <!-- id: {topic['id']} -->" - lines.append(line) - content = "\n".join(lines) + group = {} + for t in tops.values(): + group.setdefault(t.get("category", "Uncategorized"), []).append(t) + for cat in sorted(group.keys()): + lines.append(f"### {cat}") + for t in sorted(group[cat], key=lambda x: x["id"]): + lines.append(f"- [{t['title']}]({t['relative_path']}) <!-- id: {t['id']} -->") + lines.append("") try: - readme_path.write_text(content, encoding="utf-8") - log.info("Finalized README.md updated at %s", readme_path) + rp.write_text("\n".join(lines), encoding='utf-8') + log.info("Finalized README.md at %s", rp) except Exception as e: - log.error("Failed to write final README.md: %s", e) + log.error("Failed writing final README.md: %s", e) -# ----- Site Processing Function ----- -def process_site(site_url: str, base_target_dir: Path): +def update_meta(meta_file: Path, meta: dict): + log.debug("Updating meta: %s", meta) + meta_file.write_text(json.dumps(meta, indent=2), encoding='utf-8') + +# --- New function to fetch topic IDs using list topics endpoint --- +def fetch_topic_ids(site: str) -> list: """ - Archive posts and render topics for a single site. - Each site gets its own subdirectory (named for its hostname) inside the base target directory, - and its own metadata file. + Fetch topic IDs from each category using /c/{slug}/{id}.json endpoint. + Returns a list of topic IDs. + """ + topic_ids = set() + # Get categories data + cats_js = fetch_json(f"{site}/categories.json") + if not cats_js: + log.error("Failed to fetch categories from %s", site) + return list(topic_ids) + cats = cats_js.get("category_list", {}).get("categories", []) + for cat in cats: + cat_id = cat.get("id") + cat_slug = cat.get("slug") + if not cat_id or not cat_slug: + continue + url = f"{site}/c/{cat_slug}/{cat_id}.json" + js = fetch_json(url) + if not js: + log.warning("Failed to fetch topics for category %s using %s", cat_id, url) + continue + topics = js.get("topic_list", {}).get("topics", []) + for t in topics: + tid = t.get("id") + if tid: + topic_ids.add(tid) + log.info("Fetched %d topic IDs from %s", len(topic_ids), site) + return list(topic_ids) + +# --- Main processing of a site --- +def process_site(site: str, base: Path): + parsed = urlparse(site) + sname = parsed.hostname or site.replace("https://", "").replace("http://", "").split('/')[0] + log.info("Processing site: %s", site) + sdir = base / sname + posts_d = sdir / 'posts' + tops_d = sdir / 'rendered-topics' + posts_d.mkdir(parents=True, exist_ok=True) + tops_d.mkdir(parents=True, exist_ok=True) + meta_file = sdir / '.metadata.json' + meta = {"archived_topic_ids": {}, "topics": {}} - The README.md is updated incrementally after each topic is rendered. 
- """ - parsed = urlparse(site_url) - site_name = parsed.hostname or site_url.replace("https://", "").replace("http://", "").split('/')[0] - log.info("Processing site: %s", site_url) - site_target_dir = base_target_dir / site_name - posts_dir = site_target_dir / 'posts' - topics_dir = site_target_dir / 'rendered-topics' - posts_dir.mkdir(parents=True, exist_ok=True) - topics_dir.mkdir(parents=True, exist_ok=True) - metadata_file = site_target_dir / '.metadata.json' - - # Load stored metadata if exists. - metadata = {} - archived_post_ids = set() - if metadata_file.exists(): + if meta_file.exists(): try: - metadata = json.loads(metadata_file.read_text()) - if "archived_post_ids" in metadata: - archived_post_ids = set(int(x) for x in metadata.get('archived_post_ids', [])) + meta = json.loads(meta_file.read_text()) except Exception as e: - log.error("Failed to read/parse metadata file for %s: %s", site_url, e) + log.error("Failed reading meta for %s: %s", site, e) - posts_json = http_get_json(site_url, '/posts.json') - posts = posts_json.get('latest_posts', []) - last_id = None - should_stop = False + rendered_topics = meta.get("topics", {}) + topic_ids_to_process = fetch_topic_ids(site) + log.debug("Topic IDs to process: %s", topic_ids_to_process) - # List to accumulate info for final README generation. - rendered_topics_overall = [] - - while posts: - log.info("Processing %d posts for %s", len(posts), site_url) - topics_to_render = {} # Unique topics in this batch. - for json_post in posts: + rend_all = {} + + with ThreadPoolExecutor(max_workers=10) as executor: + # fetch_cats is needed to provide the category mapping + future_to_tid = {executor.submit(render_topic, site, tid, tops_d, fetch_cats(site)): tid for tid in topic_ids_to_process} + + for future in as_completed(future_to_tid): + tid = future_to_tid[future] try: - post = Post.from_json(json_post) + rendered = future.result() + if rendered: + rend_all[rendered["id"]] = rendered + meta.setdefault("topics", {})[str(rendered["id"])] = rendered + meta.setdefault("archived_topic_ids", {})[str(rendered["id"])] = { + "rendered_at": datetime.datetime.now().isoformat() + } + update_meta(meta_file, meta) + append_readme(sdir, rendered) except Exception as e: - log.warning("Failed to deserialize post %s: %s", json_post, e) - continue - if post.id in archived_post_ids: - log.debug("Post %s already archived, skipping", post.id) - continue - post.save(posts_dir) - archived_post_ids.add(post.id) - last_id = post.id - topic = post.get_topic() - topics_to_render[topic.id] = topic - # Update metadata right away so that already processed posts won't be lost on interrupt. - metadata['archived_post_ids'] = sorted(archived_post_ids) - update_metadata(metadata_file, metadata) - if topics_to_render: - log.info("Rendering %d topics concurrently for %s.", len(topics_to_render), site_url) - rendered = render_topics_concurrently(site_url, topics_to_render, topics_dir, max_workers=8) - rendered_topics_overall.extend(rendered) - if should_stop: - log.info("Stopping pagination loop based on sync date for %s.", site_url) - break - if last_id is None or last_id <= 1: - log.info("No valid last_id found for %s. 
Ending pagination loop.", site_url) - break - time.sleep(5) - posts = http_get_json(site_url, f'/posts.json?before={last_id - 1}').get('latest_posts', []) - while not posts and last_id and last_id >= 0: - last_id -= 49 - posts = http_get_json(site_url, f'/posts.json?before={last_id}').get('latest_posts', []) - time.sleep(1) + log.error("Error rendering topic %s: %s", tid, e) - # Final merge/update of README from all rendered topics. - if rendered_topics_overall: - existing = read_existing_readme(site_target_dir) - for new_topic in rendered_topics_overall: - if new_topic["id"] not in existing: - existing[new_topic["id"]] = new_topic - write_readme(site_target_dir, existing) + if rend_all: + write_readme(sdir, rend_all) else: - log.info("No topics rendered for %s; skipping final README.md generation.", site_url) + log.info("Site %s: No topics rendered; skipping final README.", site) + update_meta(meta_file, meta) + +def fetch_cats(site: str) -> dict: + """Fetch topic categories using the /categories.json endpoint for now.""" + try: + js = fetch_json(site + "/categories.json") + cats = js.get("category_list", {}).get("categories", []) + mapping = {int(c["id"]): c["name"] for c in cats} + log.info("Fetched %d categories from %s", len(mapping), site) + return mapping + except Exception as e: + log.error("Failed fetch categories from %s: %s", site, e) + return {} def main() -> None: - parameters = args() - base_target_dir = parameters.target_dir - if not isinstance(base_target_dir, Path): - base_target_dir = Path(base_target_dir) - base_target_dir.mkdir(parents=True, exist_ok=True) - sites = parse_sites(parameters.urls) + params = args() + base = params.target_dir if isinstance(params.target_dir, Path) else Path(params.target_dir) + base.mkdir(parents=True, exist_ok=True) + sites = parse_sites(params.urls) if not sites: log.error("No valid sites provided. Exiting.") sys.exit(1) - for site_url in sites: - process_site(site_url, base_target_dir) + for s in sites: + process_site(s, base) if __name__ == "__main__": main()
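
A note on the more-than-20-posts fix in render_topic: /t/{topic_id}.json embeds only the first batch of posts in post_stream.posts, while post_stream.stream lists every post ID in order, so the remainder is pulled through /t/{topic_id}/posts.json with repeated post_ids[] parameters. Below is a minimal standalone sketch of that flow, assuming an anonymously readable forum; the host and topic ID are placeholders.

import json
import urllib.request

def fetch_all_posts(site: str, topic_id: int) -> list:
    """Return every post of a topic, not just the first batch embedded in the topic JSON."""
    def get_json(url: str) -> dict:
        with urllib.request.urlopen(url, timeout=15) as resp:
            return json.loads(resp.read().decode())

    topic = get_json(f"{site}/t/{topic_id}.json")
    posts = list(topic["post_stream"]["posts"])   # first batch only
    stream = topic["post_stream"]["stream"]       # every post ID, in display order
    have = {p["id"] for p in posts}
    missing = [pid for pid in stream if pid not in have]

    # Discourse accepts repeated post_ids[] parameters; fetch the rest in chunks of 20.
    for i in range(0, len(missing), 20):
        qs = "&".join(f"post_ids[]={pid}" for pid in missing[i:i + 20])
        extra = get_json(f"{site}/t/{topic_id}/posts.json?{qs}")
        posts.extend(extra["post_stream"]["posts"])

    posts.sort(key=lambda p: p.get("post_number", 0))
    return posts

# Hypothetical usage:
# posts = fetch_all_posts("https://forum.example.org", 1234)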
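
fetch_topic_ids replaces the old /posts.json pagination with a walk over /categories.json and each category's /c/{slug}/{id}.json listing. A trimmed sketch of that traversal under the same assumptions; like the patch, it reads only the first page of each category listing, so later pages would still need to be followed for full coverage.

import json
import urllib.request

def list_topic_ids(site: str) -> set:
    """Collect topic IDs from the first page of every category listing."""
    def get_json(url: str) -> dict:
        with urllib.request.urlopen(url, timeout=15) as resp:
            return json.loads(resp.read().decode())

    topic_ids = set()
    categories = get_json(f"{site}/categories.json")["category_list"]["categories"]
    for cat in categories:
        listing = get_json(f"{site}/c/{cat['slug']}/{cat['id']}.json")
        for topic in listing.get("topic_list", {}).get("topics", []):
            topic_ids.add(topic["id"])
    return topic_ids

# Hypothetical usage:
# ids = list_topic_ids("https://forum.example.org")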
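
fetch_url gives up immediately on a 404 and retries every other failure with a doubling back-off, adding the Api-Key / Api-Username headers when the DISCOURSE_API_KEY and DISCOURSE_API_USERNAME environment variables are set. A compressed sketch of that retry pattern, with RETRY_MAX mirroring the script's constant:

import os
import time
import urllib.error
import urllib.request

RETRY_MAX = 5

def fetch_url(url: str, timeout: int = 15):
    """Return the decoded response body, or None on 404 or after RETRY_MAX failures."""
    req = urllib.request.Request(url)
    api_key = os.environ.get("DISCOURSE_API_KEY", "")
    api_user = os.environ.get("DISCOURSE_API_USERNAME", "")
    if api_key and api_user:
        req.add_header("Api-Key", api_key)
        req.add_header("Api-Username", api_user)
    backoff = 3
    for _ in range(RETRY_MAX):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return resp.read().decode()
        except urllib.error.HTTPError as e:
            if e.code == 404:
                return None          # missing resource: no point retrying
            time.sleep(backoff)
            backoff *= 2
        except Exception:
            time.sleep(backoff)
            backoff *= 2
    return None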
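
Before html2text conversion, proc_html downloads every referenced image and rewrites img src/srcset values (and image-pointing anchor hrefs) to paths relative to the rendered Markdown file. A reduced sketch of just the src rewrite step, with the download left out and rel standing in for the precomputed relative assets path:

import os
from urllib.parse import urlparse

from bs4 import BeautifulSoup  # pip install beautifulsoup4

def rewrite_image_srcs(html: str, rel: str) -> str:
    """Point each img src at the local copy that would live under the assets directory."""
    soup = BeautifulSoup(html, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        if src.startswith("//"):          # scheme-relative URLs become https
            src = "https:" + src
        fname = os.path.basename(urlparse(src).path)
        if fname:                         # this is where the download would happen
            img["src"] = os.path.join(rel, fname).replace(os.sep, "/")
    return str(soup)

print(rewrite_image_srcs('<img src="//cdn.example.org/a/b/pic.png">', "../../assets/images/42"))
# prints something like: <img src="../../assets/images/42/pic.png"/>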
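
The README table of contents depends on one line format per topic, "- [Title](relative/path.md) <!-- id: 123 -->"; TOC_PAT both parses existing entries in read_readme and bounds the insertion point in append_readme. A small round-trip check of that format; the title and path here are made up.

import re

TOC_PAT = re.compile(r"- \[(?P<title>.+?)\]\((?P<rel>.+?)\) <!-- id: (?P<id>\d+) -->")

def toc_line(title: str, rel: str, topic_id: int) -> str:
    return f"- [{title}]({rel}) <!-- id: {topic_id} -->"

line = toc_line("Welcome to the forum", "rendered-topics/2025-04-April/2025-04-03-welcome-id42.md", 42)
match = TOC_PAT.match(line)
assert match is not None
assert int(match.group("id")) == 42 and match.group("title") == "Welcome to the forum"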
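
process_site submits one render_topic call per topic to a ThreadPoolExecutor and updates the metadata file and README as each future completes, so a single failing topic cannot abort the run. A minimal sketch of that submit/as_completed pattern, with render standing in for render_topic:

from concurrent.futures import ThreadPoolExecutor, as_completed

def render(topic_id: int) -> dict:
    # Stand-in for render_topic(); returns the per-topic summary dict.
    return {"id": topic_id, "title": f"Topic {topic_id}"}

topic_ids = [101, 102, 103]
results = {}
with ThreadPoolExecutor(max_workers=10) as pool:
    futures = {pool.submit(render, tid): tid for tid in topic_ids}
    for fut in as_completed(futures):
        tid = futures[fut]
        try:
            results[tid] = fut.result()   # metadata and README updates would happen here
        except Exception as exc:
            print(f"topic {tid} failed: {exc}")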