diff --git a/archive.py b/archive.py
new file mode 100644
index 0000000..4865298
--- /dev/null
+++ b/archive.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python3
+"""
+Archive Discourse posts and render topics to Markdown from multiple sites.
+
+This script downloads posts from one or more Discourse servers via their APIs,
+archives new posts as JSON files (skipping those already saved or archived),
+renders topics to Markdown files for each batch of posts concurrently, and updates
+a metadata file after each post is indexed.
+
+Usage:
+    ./archive.py --urls https://forum.hackliberty.org,https://forum.qubes-os.org --target-dir ./archive
+"""
+
+import argparse
+import concurrent.futures
+import functools
+import json
+import logging
+import os
+import sys
+import time
+import urllib.request
+import datetime
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.parse import urlparse
+
+# Set up logging. If the 'rich' module is available, it will be used.
+loglevel = 'DEBUG' if os.environ.get('DEBUG') else 'INFO'
+try:
+    from rich.logging import RichHandler
+    logging.basicConfig(level=loglevel, datefmt="[%X]", handlers=[RichHandler()])
+except ImportError:
+    logging.basicConfig(level=loglevel)
+log = logging.getLogger('archive_and_render')
+
+# Argument parser; the parsed result is cached by args() below.
+parser = argparse.ArgumentParser(
+    description='Archive topics from one or more Discourse installations and render them to Markdown')
+parser.add_argument(
+    '--urls',
+    help='Comma-separated URLs of Discourse servers (for example: "https://forum.hackliberty.org,https://forum.qubes-os.org")',
+    default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org'))
+parser.add_argument(
+    '--debug', action='store_true', default=os.environ.get('DEBUG', False),
+    help='Enable debug logging (also enabled by the DEBUG environment variable)')
+parser.add_argument(
+    '-t', '--target-dir', help='Target base directory for the archives',
+    default=Path(os.environ.get('TARGET_DIR', './archive')))
+
+@functools.cache
+def args():
+    return parser.parse_args()
+
+def parse_sites(urls_string: str) -> list:
+    """Return a list of cleaned-up site URLs."""
+    return [url.strip().rstrip('/') for url in urls_string.split(',') if url.strip()]
+
+def http_get(site_url: str, path: str) -> str:
+    """Simple HTTP GET with exponential backoff."""
+    full_url = f"{site_url}{path}"
+    log.debug("HTTP GET %s", full_url)
+    backoff = 3
+    while True:
+        try:
+            with urllib.request.urlopen(full_url) as response:
+                return response.read().decode()
+        except Exception as e:
+            log.debug("Error fetching %s: %s -- Retrying in %d seconds", full_url, e, backoff)
+            time.sleep(backoff)
+            backoff *= 2
+            if backoff >= 256:
+                log.exception("Rate limit or unrecoverable error for %s", full_url)
+                sys.exit(1)
+
+def http_get_json(site_url: str, path: str) -> dict:
+    """Fetch URL contents from a specific site and decode JSON."""
+    try:
+        return json.loads(http_get(site_url, path))
+    except json.JSONDecodeError:
+        log.warning("Unable to decode JSON response from %r", path)
+        raise
+
+# ----- Data Models -----
+
+@dataclass(frozen=True)
+class PostTopic:
+    id: int
+    slug: str
+    title: str
+
+@dataclass(frozen=True)
+class Post:
+    id: int
+    slug: str
+    raw: dict
+
+    def get_created_at(self) -> datetime.datetime:
+        return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))
+
+    def save(self, dir: Path):
+        """Save the raw JSON post to disk if not already archived."""
+        idstr = str(self.id).zfill(10)
+        filename = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json"
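+        # Posts are grouped into YYYY-MM-MonthName subfolders derived from their creation date,
+        # e.g. "2024-05-May/0000012345-<username>-<topic-slug>.json".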
+        folder_name = self.get_created_at().strftime('%Y-%m-%B')
+        full_path = dir / folder_name / filename
+
+        if full_path.exists():
+            log.debug("Post %s already saved, skipping", self.id)
+            return
+
+        full_path.parent.mkdir(parents=True, exist_ok=True)
+        log.info("Saving post %s to %s", self.id, full_path)
+        full_path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8')
+
+    def get_topic(self) -> PostTopic:
+        return PostTopic(
+            id=self.raw.get('topic_id', self.id),
+            slug=self.raw.get('topic_slug', self.slug),
+            title=self.raw.get('topic_title', self.raw.get('title', 'No Title')),
+        )
+
+    @classmethod
+    def from_json(cls, j: dict) -> 'Post':
+        return cls(
+            id=j['id'],
+            slug=j.get('topic_slug', 'unknown'),
+            raw=j,
+        )
+
+@dataclass(frozen=True)
+class Topic:
+    id: int
+    slug: str
+    raw: dict
+    markdown: str
+
+    def get_created_at(self) -> datetime.datetime:
+        return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))
+
+    def save_rendered(self, dir: Path):
+        """
+        Save the rendered Markdown topic to disk.
+        Filename built from creation date, slug, and id.
+        """
+        date_str = str(self.get_created_at().date())
+        filename = f"{date_str}-{self.slug}-id{self.id}.md"
+        folder_name = self.get_created_at().strftime('%Y-%m-%B')
+        full_path = dir / folder_name / filename
+        full_path.parent.mkdir(parents=True, exist_ok=True)
+        log.info("Saving rendered topic %s to %s", self.id, full_path)
+        rendered_markdown = f"# {self.raw.get('title', 'No Title')}\n\n{self.markdown}"
+        full_path.write_text(rendered_markdown, encoding='utf-8')
+
+    @classmethod
+    def from_json(cls, t: dict, markdown: str) -> 'Topic':
+        slug = t.get('slug') or t.get('topic_slug') or "unknown"
+        return cls(
+            id=t.get('id', 0),
+            slug=slug,
+            raw=t,
+            markdown=markdown,
+        )
+
+# ----- Helper Functions -----
+
+def update_metadata(metadata_file: Path, metadata: dict):
+    """Writes the metadata as a JSON file to disk."""
+    log.debug("Updating metadata: %s", metadata)
+    metadata_file.write_text(json.dumps(metadata, indent=2), encoding='utf-8')
+
+def render_topic(site_url: str, topic: PostTopic, topics_dir: Path):
+    """
+    Render a single topic to Markdown.
+    Fetches the topic JSON and its raw Markdown (including additional pages if available).
+    """
+    try:
+        log.info("Fetching topic %s JSON from %s", topic.id, site_url)
+        topic_data = http_get_json(site_url, f"/t/{topic.id}.json")
+    except Exception as e:
+        log.warning("Failed to fetch topic JSON for topic %s: %s", topic.id, e)
+        return
+
+    log.info("Fetching raw markdown for topic %s from %s", topic.id, site_url)
+    body = http_get(site_url, f"/raw/{topic.id}")
+    if not body:
+        log.warning("Could not retrieve markdown body for topic %s", topic.id)
+        return
+
+    # Assemble additional pages if available.
+    page_num = 2
+    while True:
+        more_body = http_get(site_url, f"/raw/{topic.id}?page={page_num}")
+        if not more_body:
+            break
+        body += f"\n{more_body}"
+        page_num += 1
+
+    try:
+        topic_obj = Topic.from_json(topic_data, body)
+    except Exception as e:
+        log.error("Failed to create Topic object for topic %s: %s", topic.id, e)
+        return
+
+    topic_obj.save_rendered(topics_dir)
+    log.info("Saved rendered topic %s (%s)", topic_obj.id, topic_obj.slug)
+
+def render_topics_concurrently(site_url: str, topics: dict, topics_dir: Path, max_workers: int = 8):
+    """
+    Render multiple topics concurrently.
+    topics: a dictionary of topic_id -> PostTopic.
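+    Exceptions raised by individual render tasks are logged and do not abort the batch.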
+    """
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(render_topic, site_url, topic, topics_dir) for topic in topics.values()]
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                future.result()
+            except Exception as exc:
+                log.error("A topic generated an exception: %s", exc)
+
+def process_site(site_url: str, base_target_dir: Path):
+    """
+    Archive posts and render topics for a single site.
+    Each site gets its own subdirectory (named for its hostname) inside the base target directory,
+    and its own metadata file.
+    """
+    parsed = urlparse(site_url)
+    site_name = parsed.hostname or site_url.replace("https://", "").replace("http://", "").split('/')[0]
+    log.info("Processing site: %s", site_url)
+    site_target_dir = base_target_dir / site_name
+    posts_dir = site_target_dir / 'posts'
+    topics_dir = site_target_dir / 'rendered-topics'
+    posts_dir.mkdir(parents=True, exist_ok=True)
+    topics_dir.mkdir(parents=True, exist_ok=True)
+    metadata_file = site_target_dir / '.metadata.json'
+
+    # Load stored metadata if it exists.
+    metadata = {}
+    last_sync_date = None
+    archived_post_ids = set()
+    if metadata_file.exists():
+        try:
+            metadata = json.loads(metadata_file.read_text())
+            if "last_sync_date" in metadata:
+                last_sync_date = datetime.datetime.fromisoformat(metadata.get('last_sync_date'))
+            if "archived_post_ids" in metadata:
+                archived_post_ids = set(int(x) for x in metadata.get('archived_post_ids', []))
+        except Exception as e:
+            log.error("Failed to read/parse metadata file for %s: %s", site_url, e)
+
+    if last_sync_date:
+        # Step back one day to catch updates.
+        last_sync_date -= datetime.timedelta(days=1)
+        log.info("Resyncing posts from %s for %s", last_sync_date.isoformat(), site_url)
+
+    posts_json = http_get_json(site_url, '/posts.json')
+    posts = posts_json.get('latest_posts', [])
+    last_id = None
+    max_created_at = last_sync_date
+    should_stop = False
+
+    while posts:
+        log.info("Processing %d posts for %s", len(posts), site_url)
+        topics_to_render = {}  # unique topics in this batch
+        for json_post in posts:
+            try:
+                post = Post.from_json(json_post)
+            except Exception as e:
+                log.warning("Failed to deserialize post %s: %s", json_post, e)
+                continue
+
+            if post.id in archived_post_ids:
+                log.debug("Post %s already archived, skipping", post.id)
+                continue
+
+            post_created = post.get_created_at()
+            if last_sync_date is not None and post_created < last_sync_date:
+                log.info("Post %s is older than last_sync_date; stopping batch for %s.", post.id, site_url)
+                should_stop = True
+                break
+
+            post.save(posts_dir)
+            archived_post_ids.add(post.id)
+            last_id = post.id
+
+            topic = post.get_topic()
+            topics_to_render[topic.id] = topic
+
+            if max_created_at is None or post_created > max_created_at:
+                max_created_at = post_created
+
+        metadata['last_sync_date'] = max_created_at.isoformat() if max_created_at else None
+        metadata['archived_post_ids'] = sorted(archived_post_ids)
+        update_metadata(metadata_file, metadata)
+
+        # Render topics concurrently for the current batch.
+        if topics_to_render:
+            log.info("Rendering %d topics concurrently for %s.", len(topics_to_render), site_url)
+            render_topics_concurrently(site_url, topics_to_render, topics_dir, max_workers=8)
+
+        if should_stop:
+            log.info("Stopping pagination loop based on sync date for %s.", site_url)
+            break
+
+        if last_id is None or last_id <= 1:
+            log.info("No valid last_id found for %s. Ending pagination loop.", site_url)
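+            # Either no new post was archived in this batch or the oldest post id has been reached.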
+            break
+
+        time.sleep(5)
+        posts = http_get_json(site_url, f'/posts.json?before={last_id - 1}').get('latest_posts', [])
+        # Fallback if no posts were returned: step back through post ids gradually.
+        while not posts and last_id and last_id >= 0:
+            last_id -= 49
+            posts = http_get_json(site_url, f'/posts.json?before={last_id}').get('latest_posts', [])
+            time.sleep(1)
+
+def main() -> None:
+    # Parse command-line parameters.
+    parameters = args()
+    if parameters.debug:
+        # Honor --debug in addition to the DEBUG environment variable.
+        logging.getLogger().setLevel(logging.DEBUG)
+    base_target_dir = parameters.target_dir
+    if not isinstance(base_target_dir, Path):
+        base_target_dir = Path(base_target_dir)
+    base_target_dir.mkdir(parents=True, exist_ok=True)
+
+    sites = parse_sites(parameters.urls)
+    if not sites:
+        log.error("No valid sites provided. Exiting.")
+        sys.exit(1)
+
+    # Process each site.
+    for site_url in sites:
+        process_site(site_url, base_target_dir)
+
+if __name__ == "__main__":
+    main()