From c2f1fa934583825fa6f27b2186a465b9549f2059 Mon Sep 17 00:00:00 2001
From: c0mmando <103726157+c0mmando@users.noreply.github.com>
Date: Thu, 3 Apr 2025 02:34:49 +0000
Subject: [PATCH] Refactor code, add features, fix bugs
- Removed duplicate post titles
- Fixed script termination
- Removed duplicates in readme
- Removed double image links
- Cleaned up post titles
- Organized readme topics by category
- Fixed a bug preventing archiving more than 20 posts per topic
---
discourse2github.py | 974 ++++++++++++++++++++++----------------------
1 file changed, 487 insertions(+), 487 deletions(-)
diff --git a/discourse2github.py b/discourse2github.py
index 8b37072..8a9160c 100644
--- a/discourse2github.py
+++ b/discourse2github.py
@@ -2,19 +2,19 @@
"""
Archive Discourse posts and render topics to Markdown from multiple sites.
-This script downloads posts from one or more Discourse servers via their APIs.
-It archives new posts as JSON files (skipping those already saved or archived),
-renders topics to Markdown files for each batch of posts concurrently (with images
-downloaded and link URLs rewritten as relative paths), updates a metadata file
-after each post is indexed, and then updates a README.md with a table of contents
-linking to each archived topic.
+Uses locally archived JSON posts to render Markdown topics. The API is only used
+to check for and fetch new posts for a topic. The API endpoints used are:
+ - https://{defaultHost}/t/{topic_id}.json (for topic metadata)
+ - https://{defaultHost}/posts/{post_id}.json (for individual posts)
+ - https://{defaultHost}/c/{slug}/{id}.json (for listing topics by category)
Usage:
- ./discourse2github.py --urls https://forum.hackliberty.org,https://forum.qubes-os.org --target-dir ./archive
+ ./discourse2github.py --urls https://forum.example.org,... --target-dir ./archive
"""
import argparse
import concurrent.futures
+import datetime
import functools
import json
import logging
@@ -23,594 +23,594 @@ import re
import sys
import time
import urllib.request
-import datetime
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from pathlib import Path
from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
import html2text # pip install html2text
from bs4 import BeautifulSoup # pip install beautifulsoup4
-# Set up logging. If the 'rich' module is available, it will be used.
-loglevel = 'DEBUG' if os.environ.get('DEBUG') else 'INFO'
+# Logging setup: use rich if available.
+lvl = 'DEBUG' if os.environ.get('DEBUG') else 'INFO'
try:
from rich.logging import RichHandler
- logging.basicConfig(level=loglevel, datefmt="[%X]", handlers=[RichHandler()])
+ logging.basicConfig(level=lvl, datefmt="[%X]", handlers=[RichHandler()])
except ImportError:
- logging.basicConfig(level=loglevel)
+ logging.basicConfig(level=lvl)
log = logging.getLogger('archive')
-# Argument parser
-parser = argparse.ArgumentParser(
- description='Archive topics from one or more Discourse installations and render to Markdown')
-parser.add_argument(
- '--urls',
- help='Comma-separated URLs of Discourse servers (for example: "https://forum.hackliberty.org,https://forum.qubes-os.org")',
- default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org'))
-parser.add_argument(
- '--debug', action='store_true', default=os.environ.get('DEBUG', False))
-parser.add_argument(
- '-t', '--target-dir', help='Target base directory for the archives',
- default=Path(os.environ.get('TARGET_DIR', './archive')))
+# Config constants
+BATCH_SIZE = 100
+SLEEP_SEC = 2
+MAX_ITER = 1000
+RETRY_MAX = 5 # Maximum retries on error
+
+# Argument Parser
+parser = argparse.ArgumentParser(description='Archive and render Discourse topics.')
+parser.add_argument('--urls', help='Comma-separated Discourse URLs',
+ default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org'))
+parser.add_argument('--debug', action='store_true', default=os.environ.get('DEBUG', False))
+parser.add_argument('-t', '--target-dir', help='Base directory for archives',
+ default=Path(os.environ.get('TARGET_DIR', './archive')))
@functools.cache
def args():
return parser.parse_args()
-def parse_sites(urls_string: str) -> list:
- """Return a list of cleaned-up site URLs."""
- return [url.strip().rstrip('/') for url in urls_string.split(',') if url.strip()]
+def parse_sites(urls: str) -> list:
+ return [u.strip().rstrip('/') for u in urls.split(',') if u.strip()]
-def http_get(site_url: str, path: str, timeout: int = 15) -> str:
- """Simple HTTP GET with exponential backoff and a timeout."""
- full_url = f"{site_url}{path}"
- log.debug("HTTP GET %s", full_url)
+# API credentials (optional)
+API_KEY = os.environ.get("DISCOURSE_API_KEY", "")
+API_USER = os.environ.get("DISCOURSE_API_USERNAME", "")
+
+def fetch_url(url: str, timeout=15) -> str:
+ """
+ Fetch a URL with a retry loop. Logs additional debug info.
+ If a 404 error is encountered, immediately return None.
+ For other errors, wait and retry until RETRY_MAX is reached.
+ """
backoff = 3
- while True:
+ attempts = 0
+ req = urllib.request.Request(url)
+ # Add API headers if available.
+ if API_KEY and API_USER:
+ req.add_header("Api-Key", API_KEY)
+ req.add_header("Api-Username", API_USER)
+ while attempts < RETRY_MAX:
try:
- with urllib.request.urlopen(full_url, timeout=timeout) as response:
- return response.read().decode()
- except Exception as e:
- log.debug("Error fetching %s: %s -- Retrying in %d seconds", full_url, e, backoff)
+ log.debug("Attempt %d: Fetching URL: %s", attempts + 1, url)
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
+ data = resp.read().decode()
+ log.debug(
+ "Successfully fetched URL: %s | HTTP Status: %s | Response length: %d bytes",
+ url, resp.status, len(data)
+ )
+ return data
+ except urllib.error.HTTPError as e:
+ if e.code == 404:
+ log.warning("Resource not found (404) for %s, skipping further retries", url)
+ return None
+ attempts += 1
+ log.warning("HTTPError fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True)
time.sleep(backoff)
backoff *= 2
- if backoff >= 256:
- log.exception("Rate limit or unrecoverable error for %s", full_url)
- sys.exit(1)
+ except Exception as e:
+ attempts += 1
+ log.warning("Error fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True)
+ time.sleep(backoff)
+ backoff *= 2
+ log.error("Failed fetching %s after %d attempts.", url, RETRY_MAX)
+ return None
-def http_get_json(site_url: str, path: str, timeout: int = 15) -> dict:
- """Fetch URL contents from a specific site and decode JSON."""
+def fetch_json(url: str, timeout=15) -> dict:
+ """
+ Fetch JSON data from a URL.
+ Logs the received raw data size and the parsed JSON keys where applicable.
+ Returns None if the fetch failed or returned 404.
+ """
+ data = fetch_url(url, timeout)
+ if data is None:
+ log.debug("No data returned for URL: %s", url)
+ return None
+ log.debug("Fetched raw data from %s (length: %d bytes)", url, len(data))
try:
- return json.loads(http_get(site_url, path, timeout=timeout))
- except json.JSONDecodeError:
- log.warning("Unable to decode JSON response from %r", path)
- raise
-
-# ----- Helper: Truncate Filename -----
-def truncate_filename(filename: str, max_length: int = 255) -> str:
- """
- Truncates the file name to a maximum length (default 255 characters).
- It preserves the file extension.
- """
- if len(filename) <= max_length:
- return filename
- p = Path(filename)
- stem = p.stem
- suffix = "".join(p.suffixes)
- max_stem_length = max_length - len(suffix)
- if max_stem_length <= 0:
- return filename[:max_length]
- truncated_stem = stem[:max_stem_length]
- return truncated_stem + suffix
-
-# ----- Image / Link Download Helpers -----
-def fix_url(original_url: str) -> str:
- """Fix scheme-relative URLs by prepending https: if necessary."""
- if original_url.startswith("//"):
- fixed = "https:" + original_url
- log.debug("Converted scheme-relative URL: %s -> %s", original_url, fixed)
- return fixed
- return original_url
-
-def download_image(image_url: str, dest_path: Path, timeout: int = 15):
- """
- Download an image from image_url and save it to dest_path.
- If the file already exists, skip downloading.
- A timeout is specified to avoid hanging indefinitely.
- """
- if dest_path.exists():
- log.debug("Image already downloaded: %s", dest_path)
- return
- try:
- log.info("Downloading image: %s", image_url)
- with urllib.request.urlopen(fix_url(image_url), timeout=timeout) as response:
- image_data = response.read()
- dest_path.parent.mkdir(parents=True, exist_ok=True)
- dest_path.write_bytes(image_data)
- log.info("Saved image to %s", dest_path)
- except Exception as e:
- log.error("Failed to download image %s: %s", image_url, e)
-
-def process_srcset(srcset_value: str, topic_dir: Path, topic_relative_path: str) -> str:
- """
- Process a srcset attribute value, downloading images and returning a rewritten value.
- Downloads every image referenced regardless of URL content.
- """
- entries = srcset_value.split(",")
- fixed_entries = []
- for entry in entries:
- parts = entry.strip().split()
- if not parts:
- continue
- orig_url = parts[0]
- fixed_url = fix_url(orig_url)
- parsed = urlparse(fixed_url)
- image_filename = os.path.basename(parsed.path)
- if not image_filename:
- log.warning("Skipping srcset URL with empty filename: %s", fixed_url)
- continue
- dest_path = topic_dir / image_filename
- download_image(fixed_url, dest_path)
- full_path = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/')
- if len(parts) > 1:
- fixed_entries.append(f"{full_path} {parts[1]}")
+ js = json.loads(data)
+ if isinstance(js, dict):
+ log.debug("JSON parsed from %s, keys: %s", url, list(js.keys()))
else:
- fixed_entries.append(f"{full_path}")
- return ", ".join(fixed_entries)
+ log.debug("JSON parsed from %s is not a dict (type: %s)", url, type(js).__name__)
+ return js
+ except json.JSONDecodeError as e:
+ log.error("JSON decode error for %s: %s", url, e, exc_info=True)
+ return None
-def is_image_link(url: str) -> bool:
- """Determine if the URL points to an image by its extension."""
- image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".webp")
- parsed = urlparse(url)
- filename = os.path.basename(parsed.path).lower()
- return filename.endswith(image_extensions)
-def process_html(html_content: str, topic_dir: Path, topic_relative_path: str) -> str:
- """
- Process the given HTML: download referenced images and rewrite links.
- Processes both <img> (src, srcset) and <a> tags pointing to images.
- Downloads every image referenced in the HTML.
- Returns the modified HTML.
- """
- soup = BeautifulSoup(html_content, "html.parser")
+def truncate_fn(name: str, max_len=255) -> str:
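+ # Truncate an over-long filename to max_len characters while preserving its extension(s).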
+ if len(name) <= max_len:
+ return name
+ p = Path(name)
+ stem, suffix = p.stem, "".join(p.suffixes)
+ allowed = max_len - len(suffix)
+ return stem[:allowed] + suffix if allowed > 0 else name[:max_len]
- # Process <img> tags.
+# --- Helpers for images & HTML content ---
+def fix_url(url: str) -> str:
+ return "https:" + url if url.startswith("//") else url
+
+def download_img(url: str, dest: Path, tid: int = None, timeout=15):
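+ # Download a single image to dest with exponential-backoff retries; skip if the file already exists.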
+ if dest.exists():
+ log.debug("Img exists for topic %s: %s", tid, dest)
+ return
+ attempts = 0
+ backoff = 2
+ while attempts < RETRY_MAX:
+ try:
+ log.info("Downloading img for topic %s: %s", tid, url)
+ with urllib.request.urlopen(fix_url(url), timeout=timeout) as r:
+ data = r.read()
+ dest.parent.mkdir(parents=True, exist_ok=True)
+ dest.write_bytes(data)
+ log.info("Saved img for topic %s to %s", tid, dest)
+ return
+ except Exception as e:
+ attempts += 1
+ log.warning("Failed downloading img for topic %s from %s: %s (attempt %d/%d)", tid, url, e, attempts, RETRY_MAX)
+ time.sleep(backoff)
+ backoff *= 2
+ log.error("Exceeded maximum retries downloading image %s for topic %s", url, tid)
+
+def proc_srcset(srcset: str, tdir: Path, rel: str, tid: int) -> str:
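+ # Download every image referenced in the srcset attribute and rewrite each entry to a path relative to the rendered markdown file.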
+ parts = [e.strip() for e in srcset.split(",")]
+ out = []
+ for e in parts:
+ seg = e.split()
+ if not seg:
+ continue
+ orig = seg[0]
+ fixed = fix_url(orig)
+ fname = os.path.basename(urlparse(fixed).path)
+ if not fname:
+ log.warning("Empty filename in srcset for topic %s: %s", tid, fixed)
+ continue
+ dest = tdir / fname
+ download_img(fixed, dest, tid)
+ full = os.path.join(rel, fname).replace(os.sep, '/')
+ out.append(f"{full} {seg[1]}" if len(seg) > 1 else full)
+ return ", ".join(out)
+
+def is_img_link(url: str) -> bool:
+ return os.path.basename(urlparse(url).path).lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp"))
+
+def remove_img_anchor(soup):
+ # Remove anchors that wrap images.
+ for a in soup.find_all("a"):
+ if a.find("img"):
+ a.replace_with(*a.contents)
+ return soup
+
+def proc_html(html, tdir: Path, rel: str, tid: int) -> str:
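+ # Download images referenced by <img> tags and image links in <a> tags, rewrite them to local relative paths, unwrap image anchors, and return the modified HTML.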
+ soup = BeautifulSoup(html, "html.parser")
+ cnt = 0
for img in soup.find_all("img"):
src = img.get("src")
if src:
src = fix_url(src)
- parsed = urlparse(src)
- image_filename = os.path.basename(parsed.path)
- if image_filename:
- dest_path = topic_dir / image_filename
- download_image(src, dest_path)
- full_src = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/')
- img["src"] = full_src
+ fname = os.path.basename(urlparse(src).path)
+ if fname:
+ dest = tdir / fname
+ download_img(src, dest, tid)
+ cnt += 1
+ img["src"] = os.path.join(rel, fname).replace(os.sep, '/')
else:
- log.warning("Skipping image with empty filename from src: %s", src)
- srcset = img.get("srcset")
- if srcset:
- new_srcset = process_srcset(srcset, topic_dir, topic_relative_path)
- img["srcset"] = new_srcset
-
- # Process <a> tags whose href points to images.
+ log.warning("Empty filename in src for topic %s: %s", tid, src)
+ if s := img.get("srcset"):
+ img["srcset"] = proc_srcset(s, tdir, rel, tid)
for a in soup.find_all("a"):
href = a.get("href")
if href:
- fixed_href = fix_url(href)
- if is_image_link(fixed_href):
- parsed = urlparse(fixed_href)
- image_filename = os.path.basename(parsed.path)
- if image_filename:
- dest_path = topic_dir / image_filename
- download_image(fixed_href, dest_path)
- new_href = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/')
- a["href"] = new_href
+ fixed = fix_url(href)
+ if is_img_link(fixed):
+ fname = os.path.basename(urlparse(fixed).path)
+ if fname:
+ dest = tdir / fname
+ download_img(fixed, dest, tid)
+ cnt += 1
+ a["href"] = os.path.join(rel, fname).replace(os.sep, '/')
+ if a.string:
+ a.string.replace_with("")
else:
- log.warning("Skipping link with empty filename from href: %s", fixed_href)
+ log.warning("Empty filename in href for topic %s: %s", tid, fixed)
+ remove_img_anchor(soup)
+ log.debug("Processed %d images for topic %s", cnt, tid)
return str(soup)
-def slugify(value: str) -> str:
- """
- Normalizes string, removes non-alphanumeric characters, and converts whitespace to hyphens.
- Useful for constructing filenames.
- """
- value = str(value)
- value = value.strip().lower()
- value = re.sub(r'[^a-z0-9\s-]', '', value)
- value = re.sub(r'[\s-]+', '-', value)
- return value or "untitled"
+def slugify(s: str) -> str:
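+ # Lowercase, drop non-alphanumeric characters, and collapse whitespace/hyphens; fall back to "untitled".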
+ s = re.sub(r'[^a-z0-9\s-]', '', s.strip().lower())
+ return re.sub(r'[\s-]+', '-', s) or "untitled"
-# ----- Data Models -----
+# --- Data models ---
@dataclass(frozen=True)
class PostTopic:
id: int
slug: str
title: str
+ category_id: int
-@dataclass(frozen=True)
+@dataclass
class Post:
id: int
slug: str
raw: dict
- def get_created_at(self) -> datetime.datetime:
+ def created_at(self) -> datetime.datetime:
return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))
- def save(self, dir: Path):
- """Save the raw JSON post to disk if not already archived."""
+ def updated_at(self) -> datetime.datetime:
+ return datetime.datetime.fromisoformat(self.raw['updated_at'].replace("Z", "+00:00"))
+
+ def save(self, d: Path) -> None:
+ """Save the post JSON to disk (archive)."""
idstr = str(self.id).zfill(10)
- filename = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json"
- filename = truncate_filename(filename)
- folder_name = self.get_created_at().strftime('%Y-%m-%B')
- full_path = dir / folder_name / filename
-
- if full_path.exists():
- log.debug("Post %s already saved, skipping", self.id)
- return
-
- full_path.parent.mkdir(parents=True, exist_ok=True)
- log.info("Saving post %s to %s", self.id, full_path)
- full_path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8')
-
- def get_topic(self) -> PostTopic:
- return PostTopic(
- id=self.raw.get('topic_id', self.id),
- slug=self.raw.get('topic_slug', self.slug),
- title=self.raw.get('topic_title', self.raw.get('title', 'No Title')),
- )
+ fn = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json"
+ fn = truncate_fn(fn)
+ folder = self.created_at().strftime('%Y-%m-%B')
+ path = d / folder / fn
+ # Only write if changed.
+ if path.exists():
+ try:
+ ex = json.loads(path.read_text(encoding='utf-8'))
+ if ex.get("updated_at") == self.raw.get("updated_at"):
+ log.debug("Post %s unchanged; skip saving.", self.id)
+ return
+ except Exception as e:
+ log.debug("Error reading %s: %s", path, e)
+ path.parent.mkdir(parents=True, exist_ok=True)
+ log.info("Saving post %s to %s", self.id, path)
+ path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8')
@classmethod
def from_json(cls, j: dict) -> 'Post':
- return cls(
- id=j['id'],
- slug=j.get('topic_slug', 'unknown'),
- raw=j,
- )
+ return cls(id=j['id'], slug=j.get('topic_slug', 'unknown'), raw=j)
-@dataclass(frozen=True)
+@dataclass
class Topic:
id: int
slug: str
- raw: dict
- markdown: str
+ title: str
+ category_id: int
+ created_at_str: str
+ markdown: str = field(default="") # initial markdown content
- def get_created_at(self) -> datetime.datetime:
- return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))
+ def created_at(self) -> datetime.datetime:
+ return datetime.datetime.fromisoformat(self.created_at_str.replace("Z", "+00:00"))
- def save_rendered(self, dir: Path):
- """
- Save the rendered Markdown topic to disk.
- Filename built from creation date, slug, and id.
- Truncate the filename if needed.
- """
- date_str = str(self.get_created_at().date())
- filename = f"{date_str}-{self.slug}-id{self.id}.md"
- filename = truncate_filename(filename)
- folder_name = self.get_created_at().strftime('%Y-%m-%B')
- full_path = dir / folder_name / filename
- full_path.parent.mkdir(parents=True, exist_ok=True)
- log.info("Saving rendered topic %s to %s", self.id, full_path)
- rendered_markdown = f"# {self.raw.get('title', 'No Title')}\n\n{self.markdown}"
- full_path.write_text(rendered_markdown, encoding='utf-8')
- # Return the relative path from the repository root.
- return full_path.relative_to(dir.parent)
+ def save_rendered(self, d: Path) -> Path:
+ date_s = str(self.created_at().date())
+ fn = f"{date_s}-{self.slug}-id{self.id}.md"
+ fn = truncate_fn(fn)
+ folder = self.created_at().strftime('%Y-%m-%B')
+ path = d / folder / fn
+ path.parent.mkdir(parents=True, exist_ok=True)
+ log.info("Saving rendered topic %s to %s", self.id, path)
+ path.write_text(self.markdown, encoding='utf-8')
+ return path.relative_to(d.parent)
- @classmethod
- def from_json(cls, t: dict, markdown: str) -> 'Topic':
- slug = t.get('slug') or t.get('topic_slug') or "unknown"
- return cls(
- id=t.get('id', 0),
- slug=slug,
- raw=t,
- markdown=markdown,
- )
+# --- API fetching for topics and posts ---
+def fetch_topic_meta(site: str, topic_id: int) -> dict:
+ url = f"{site}/t/{topic_id}.json"
+ result = fetch_json(url)
+ if result is None:
+ log.warning("Topic metadata not found for topic %s", topic_id)
+ return result
-# ----- New Helper for Rendering Topics with Image Downloading -----
-def render_topic(site_url: str, topic: PostTopic, topics_dir: Path):
+def fetch_single_post(site: str, post_id: int) -> dict:
"""
- Render a single topic to Markdown by:
- 1. Fetching the topic JSON.
- 2. Downloading its associated images and rewriting their URLs.
- 3. Converting processed HTML to Markdown (using html2text).
- 4. Saving the rendered Markdown document.
-
- Images are saved to an assets directory relative to the site target directory.
- Returns a dictionary with topic info for README updating.
+ Fetch a single post by post_id from the site.
+ Logs detailed info upon a successful fetch.
"""
- try:
- log.info("Fetching topic %s JSON from %s", topic.id, site_url)
- topic_data = http_get_json(site_url, f"/t/{topic.id}.json")
- except Exception as e:
- log.warning("Failed to fetch topic JSON for topic %s: %s", topic.id, e)
+ url = f"{site}/posts/{post_id}.json"
+ result = fetch_json(url)
+ if result is None:
+ log.warning("Post %s not found on site %s", post_id, site)
+ else:
+ # Log detailed post information if available
+ username = result.get("username", "unknown")
+ topic_slug = result.get("topic_slug", "unknown")
+ created_at = result.get("created_at", "unknown time")
+ log.debug("Fetched post %s: topic_slug='%s', username='%s', created_at='%s'",
+ post_id, topic_slug, username, created_at)
+ # Also log the size of the returned JSON payload for debugging:
+ log.debug("Post %s JSON size: %d bytes", post_id, len(json.dumps(result)))
+ return result
+
+# --- Rendering functions using fresh API post data ---
+def render_topic(site: str, topic_id: int, tops_dir: Path, cats: dict) -> dict:
+ """
+ Render each post individually and append it immediately to the topic markdown file.
+ This version fetches EVERY post in the topic (using additional API calls if needed),
+ not just the first 20.
+ """
+ topic_meta = fetch_topic_meta(site, topic_id)
+ if not topic_meta:
+ log.warning("No metadata found for topic %s; skipping render.", topic_id)
return None
- # Define the assets directory in the repository root.
- assets_dir = topics_dir.parent / "assets" / "images" / f"{topic.id}"
- assets_dir.mkdir(parents=True, exist_ok=True)
+ # Use the topic meta from /t/{topic_id}.json
+ slug = topic_meta.get("slug", "unknown")
+ title = topic_meta.get("title", "No Title")
+ category_id = int(topic_meta.get("category_id", 0))
+ created_at_str = topic_meta.get("created_at", datetime.datetime.now().isoformat())
- # Determine the directory where the rendered markdown file will be saved.
- try:
- created_at = datetime.datetime.fromisoformat(topic_data['created_at'].replace("Z", "+00:00"))
- except Exception as e:
- log.error("Could not parse created_at for topic %s: %s", topic.id, e)
- created_at = datetime.datetime.now()
- folder_name = created_at.strftime('%Y-%m-%B')
- rendered_md_dir = topics_dir / folder_name
+ # Create assets dir for images.
+ assets = tops_dir.parent / "assets" / "images" / f"{topic_id}"
+ assets.mkdir(parents=True, exist_ok=True)
+ folder = datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).strftime('%Y-%m-%B')
+ md_dir = tops_dir / folder
+ rel_path = os.path.relpath(assets, md_dir)
- # Compute the relative path from the markdown file's directory to the assets directory.
- topic_relative_path = os.path.relpath(assets_dir, rendered_md_dir)
+ # Create or truncate the markdown topic file
+ date_s = str(datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).date())
+ fn = f"{date_s}-{slug}-id{topic_id}.md"
+ fn = truncate_fn(fn)
+ topic_md_path = md_dir / fn
+ topic_md_path.parent.mkdir(parents=True, exist_ok=True)
+ log.info("Creating markdown file for topic %s at %s", topic_id, topic_md_path)
+ # Write the topic title as header
+ with topic_md_path.open(mode="w", encoding="utf8") as f:
+ f.write(f"# {title}\n\n")
- posts = topic_data.get("post_stream", {}).get("posts", [])
- if not posts:
- log.error("No posts found for topic %s", topic.id)
- return None
+ conv = html2text.HTML2Text()
+ conv.body_width = 0
- converter = html2text.HTML2Text()
- converter.body_width = 0
- md_sections = []
- for post in posts:
- created = post.get("created_at", "unknown")
- updated = post.get("updated_at", "unknown")
- post_number = post.get("post_number", 0)
- cooked_html = post.get("cooked", "")
- # Pass the corrected topic_relative_path into process_html()
- processed_html = process_html(cooked_html, assets_dir, topic_relative_path)
- post_md = converter.handle(processed_html)
- header_lines = [
- f"**ID:** {topic.id}",
- f"**USERNAME:** {post.get('username', 'unknown')}",
- f"**POST NUMBER:** {post_number}",
- f"**CREATED AT:** {created}",
- f"**UPDATED AT:** {updated}",
- ]
- # Join header lines with two newlines so each appears on its own line in GitHub Markdown.
- header = "\n\n".join(header_lines)
- section = f"## Post {post_number}\n\n{header}\n\n---\n\n{post_md}"
- md_sections.append(section)
- full_md = "\n\n".join(md_sections)
- topic_title = topic_data.get("title", "No Title")
- full_md = f"# {topic_title}\n\n" + full_md
+ # ---- Modified section: Fetch ALL posts for the topic ----
+ # Get posts from topic_meta (first 20 posts)
+ posts_meta = topic_meta.get("post_stream", {}).get("posts", [])
+ # Also get the full post stream (IDs) which might include extra post IDs
+ full_stream = topic_meta.get("post_stream", {}).get("stream", [])
+ # Identify extra post IDs that might not be in posts_meta
+ # (Since posts_meta are typically the first 20 posts.)
+ extra_ids = [pid for pid in full_stream if pid not in [p.get("id") for p in posts_meta]]
+ log.debug("Topic %s: %d posts in initial load, %d extra IDs detected.", topic_id, len(posts_meta), len(extra_ids))
- topic_obj = Topic.from_json(topic_data, full_md)
- saved_relative_path = topic_obj.save_rendered(topics_dir)
- log.info("Saved rendered topic %s (%s)", topic_obj.id, topic_obj.slug)
- # Return topic info for README.
- return {
- "id": topic_obj.id,
- "slug": topic_obj.slug,
- "title": topic_title,
- "relative_path": str(saved_relative_path)
- }
+ # Fetch extras in chunks (say, 20 per request)
+ n = 20
+ if extra_ids:
+ chunks = [extra_ids[i:i+n] for i in range(0, len(extra_ids), n)]
+ for chunk in chunks:
+ # Build query string with multiple post_ids[] parameters
+ qs = "&".join([f"post_ids[]={pid}" for pid in chunk])
+ posts_extra_url = f"{site}/t/{topic_id}/posts.json?{qs}"
+ extra_response = fetch_json(posts_extra_url)
+ if extra_response and "post_stream" in extra_response and "posts" in extra_response["post_stream"]:
+ extra_posts = extra_response["post_stream"]["posts"]
+ posts_meta.extend(extra_posts)
+ else:
+ log.warning("Failed fetching extra posts for topic %s with URL: %s", topic_id, posts_extra_url)
-# ----- Concurrent Rendering Helper -----
-def render_topics_concurrently(site_url: str, topics: dict, topics_dir: Path, max_workers: int = 8):
- """
- Render multiple topics concurrently.
- Returns a list of rendered topic information dictionaries.
- """
- rendered_topics_info = []
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
- futures = {executor.submit(render_topic, site_url, topic, topics_dir): topic for topic in topics.values()}
- for future in concurrent.futures.as_completed(futures):
- try:
- result = future.result()
- if result:
- rendered_topics_info.append(result)
- # Update the README incrementally after each topic is rendered.
- update_readme_incrementally(topics_dir.parent, result)
- except Exception as exc:
- log.error("A topic generated an exception: %s", exc)
- return rendered_topics_info
+ # Sort posts by post_number (when available) to preserve the original order
+ posts_meta.sort(key=lambda p: p.get("post_number", 0))
+ # ---- End fetch-all posts section ----
-def update_metadata(metadata_file: Path, metadata: dict):
- """Writes the metadata as a JSON file to disk."""
- log.debug("Updating metadata: %s", metadata)
- metadata_file.write_text(json.dumps(metadata, indent=2), encoding='utf-8')
+ # Extract post IDs from the combined posts_meta
+ post_ids = [post["id"] for post in posts_meta]
+ log.debug("Processing a total of %d posts for topic %s", len(post_ids), topic_id)
-# A helper pattern to match a TOC line (i.e. a line with the topic entry and its id)
-TOC_LINE_PATTERN = re.compile(
- r"- $(?P.+?)$(?P.+?)$\s*")
-
-# ----- README Update Helpers -----
-def read_existing_readme(repo_root: Path):
- """
- Read the existing README.md from repo_root and return a dictionary of topics.
- The keys will be the topic IDs (as integers) and the values as the topic dict.
- If the file doesn't exist, return an empty dict.
- """
- readme_path = repo_root / "README.md"
- existing_topics = {}
- if readme_path.exists():
+ # Now process each post (as before)
+ for post in posts_meta:
try:
- content = readme_path.read_text(encoding='utf-8')
- # Expecting lines like: - [Topic Title](relative_path)
- pattern = TOC_LINE_PATTERN
- for line in content.splitlines():
- match = pattern.match(line)
- if match:
- topic_id = int(match.group("id"))
- existing_topics[topic_id] = {
- "id": topic_id,
- "title": match.group("title"),
- "relative_path": match.group("relative_path")
- }
+ post_id = post.get("id")
+ log.debug("Processing post ID %s for topic %s", post_id, topic_id)
+ # Create header for the post and fetch necessary dates
+ cdt = datetime.datetime.fromisoformat(post.get("created_at").replace("Z", "+00:00"))
+ udt = datetime.datetime.fromisoformat(post.get("updated_at", "").replace("Z", "+00:00")) if post.get("updated_at") else cdt
+ hdr = (f"> **Post #{post.get('post_number', 0)} • {post.get('username', 'unknown')}**\n"
+ f"> Created: {cdt.strftime('%Y-%m-%d %H:%M')}\n"
+ f"> Updated: {udt.strftime('%Y-%m-%d %H:%M')}")
+ cooked = post.get("cooked", "")
+ proc = proc_html(cooked, assets, rel_path, topic_id)
+ md_post = conv.handle(proc)
+
+ # Clean up the markdown post
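+ # Drop image residue html2text leaves from Discourse uploads, e.g. "screenshot.png 690×388 45.3 KB"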
+ clean_lines = []
+ for l in md_post.splitlines():
+ if re.search(r'\S+\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)$', l, flags=re.IGNORECASE):
+ continue
+ clean_lines.append(l)
+ md_post = "\n".join(clean_lines)
+ md_post = re.sub(r'(\S+)\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)', r'\1', md_post, flags=re.IGNORECASE)
+
+ section = f"\n\n{hdr}\n\n{md_post}\n\n\n\n"
+ with topic_md_path.open(mode="a", encoding="utf8") as f:
+ f.write(section)
+ log.debug("Appended post #%s (ID %s) to topic markdown file", post.get("post_number", "?"), post_id)
+ time.sleep(0.2) # brief pause between posts to avoid bursts of image downloads
except Exception as e:
- log.error("Failed to parse existing README.md: %s", e)
- return existing_topics
+ log.error("Error processing post %s: %s", post.get("id"), e)
-def update_readme_incrementally(repo_root: Path, new_topic: dict):
- """
- Update or create README.md in repo_root by merging the new topic into the existing list.
- If the topic already exists, report that. Otherwise, append the new topic to the TOC.
- """
- topic_id = new_topic["id"]
- existing_topics = read_existing_readme(repo_root)
- if topic_id in existing_topics:
- log.debug("Topic with id %s already exists in README.md", topic_id)
- return
- existing_topics[topic_id] = new_topic
- append_to_readme(repo_root, new_topic)
+ # After processing, read the file content and return the topic info.
+ full_md = topic_md_path.read_text(encoding='utf8')
+ topic_obj = Topic(
+ id=topic_id,
+ slug=slug,
+ title=title,
+ category_id=category_id,
+ created_at_str=created_at_str,
+ markdown=full_md,
+ )
+ rel_saved = topic_obj.save_rendered(tops_dir) # This rewrites the file; that's acceptable.
+ log.info("Rendered topic %s (%s) with %d posts", topic_obj.id, topic_obj.slug, len(post_ids))
+ return {"id": topic_id, "title": title, "relative_path": str(rel_saved), "category": cats.get(category_id, "Uncategorized")}
-def append_to_readme(repo_root: Path, new_topic: dict):
- """
- Append a new topic to the existing README.md table-of-contents (TOC).
- If README.md doesn't exist, create it with a header and the new topic.
- """
- readme_path = repo_root / "README.md"
- toc_header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
- new_toc_line = f"- [{new_topic['title']}]({new_topic['relative_path']}) "
- if readme_path.exists():
+# --- README update functions ---
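+# A TOC entry looks like: - [Topic Title](path/to/topic.md) <!-- id: 123 -->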
+TOC_PAT = re.compile(r"- $$(?P.+?)$$$(?P.+?)$ ")
+def read_readme(root: Path):
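+ # Parse existing README TOC entries back into a {topic_id: {id, title, relative_path}} dict.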
+ rp = root / "README.md"
+ topics = {}
+ if rp.exists():
try:
- # Read the existing content
- content = readme_path.read_text(encoding="utf-8")
- lines = content.splitlines()
- # Check if the file already has a TOC header by looking for the header marker.
+ for l in rp.read_text(encoding="utf-8").splitlines():
+ m = TOC_PAT.match(l.strip())
+ if m:
+ tid = int(m.group("id"))
+ topics[tid] = {"id": tid, "title": m.group("title"), "relative_path": m.group("rel")}
+ except Exception as e:
+ log.error("Failed parsing README.md: %s", e)
+ return topics
+
+def append_readme(root: Path, ntop: dict):
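+ # Add one TOC entry for a newly rendered topic, creating README.md with a standard header if it does not exist yet.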
+ rp = root / "README.md"
+ header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
+ line = f"- [{ntop['title']}]({ntop['relative_path']}) "
+ if rp.exists():
+ try:
+ lines = rp.read_text(encoding="utf-8").splitlines()
try:
- toc_start = lines.index("## Table of Contents")
- # Find the blank line after the TOC header if exists
- insertion_index = toc_start + 1
- # Advance until we find the first non-TOC line or reach the end.
- while (
- insertion_index < len(lines)
- and TOC_LINE_PATTERN.match(lines[insertion_index].strip())
- ):
- insertion_index += 1
- # Now, insert our new entry just before the first non-TOC line.
- lines.insert(insertion_index, new_toc_line)
- new_content = "\n".join(lines)
+ idx = lines.index("## Table of Contents") + 1
+ while idx < len(lines) and TOC_PAT.match(lines[idx].strip()):
+ idx += 1
+ lines.insert(idx, line)
+ newc = "\n".join(lines)
except ValueError:
- # "## Table of Contents" not found, so we create a new TOC block at the top
- new_content = "\n".join(toc_header + [new_toc_line] + [""] + lines)
+ newc = "\n".join(header + [line] + [""] + lines)
except Exception as e:
- log.error("Failed to read existing README.md: %s", e)
- # In case of error, default to creating a new README.md with header and new topic
- new_content = "\n".join(toc_header + [new_toc_line])
+ log.error("Error reading README.md: %s", e)
+ newc = "\n".join(header + [line])
else:
- # README.md doesn't exist, create a new one with a standard header and the new TOC entry
- new_content = "\n".join(toc_header + [new_toc_line])
-
+ newc = "\n".join(header + [line])
try:
- readme_path.write_text(new_content, encoding="utf-8")
- log.info("Updated README.md at %s", readme_path)
+ rp.write_text(newc, encoding="utf-8")
+ log.info("Updated README.md at %s", rp)
except Exception as e:
- log.error("Failed to write README.md: %s", e)
+ log.error("Failed writing README.md: %s", e)
-def write_readme(site_target_dir: Path, topics: dict):
- """
- Given a dictionary of topics, write out the full README.md at the site target directory.
- """
- readme_path = site_target_dir / "README.md"
+def write_readme(site_dir: Path, tops: dict):
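+ # Regenerate README.md in full, grouping TOC entries by category name and sorting by topic id within each group.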
+ rp = site_dir / "README.md"
lines = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
- sorted_topics = sorted(topics.values(), key=lambda t: t["id"])
- for topic in sorted_topics:
- line = f"- [{topic['title']}]({topic['relative_path']}) "
- lines.append(line)
- content = "\n".join(lines)
+ group = {}
+ for t in tops.values():
+ group.setdefault(t.get("category", "Uncategorized"), []).append(t)
+ for cat in sorted(group.keys()):
+ lines.append(f"### {cat}")
+ for t in sorted(group[cat], key=lambda x: x["id"]):
+ lines.append(f"- [{t['title']}]({t['relative_path']}) ")
+ lines.append("")
try:
- readme_path.write_text(content, encoding="utf-8")
- log.info("Finalized README.md updated at %s", readme_path)
+ rp.write_text("\n".join(lines), encoding='utf-8')
+ log.info("Finalized README.md at %s", rp)
except Exception as e:
- log.error("Failed to write final README.md: %s", e)
+ log.error("Failed writing final README.md: %s", e)
-# ----- Site Processing Function -----
-def process_site(site_url: str, base_target_dir: Path):
+def update_meta(meta_file: Path, meta: dict):
+ log.debug("Updating meta: %s", meta)
+ meta_file.write_text(json.dumps(meta, indent=2), encoding='utf-8')
+
+# --- New function to fetch topic IDs using list topics endpoint ---
+def fetch_topic_ids(site: str) -> list:
"""
- Archive posts and render topics for a single site.
- Each site gets its own subdirectory (named for its hostname) inside the base target directory,
- and its own metadata file.
+ Fetch topic IDs from each category using /c/{slug}/{id}.json endpoint.
+ Returns a list of topic IDs.
+ """
+ topic_ids = set()
+ # Get categories data
+ cats_js = fetch_json(f"{site}/categories.json")
+ if not cats_js:
+ log.error("Failed to fetch categories from %s", site)
+ return list(topic_ids)
+ cats = cats_js.get("category_list", {}).get("categories", [])
+ for cat in cats:
+ cat_id = cat.get("id")
+ cat_slug = cat.get("slug")
+ if not cat_id or not cat_slug:
+ continue
+ url = f"{site}/c/{cat_slug}/{cat_id}.json"
+ js = fetch_json(url)
+ if not js:
+ log.warning("Failed to fetch topics for category %s using %s", cat_id, url)
+ continue
+ topics = js.get("topic_list", {}).get("topics", [])
+ for t in topics:
+ tid = t.get("id")
+ if tid:
+ topic_ids.add(tid)
+ log.info("Fetched %d topic IDs from %s", len(topic_ids), site)
+ return list(topic_ids)
+
+# --- Main processing of a site ---
+def process_site(site: str, base: Path):
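+ # Render every topic for a single site into its own subdirectory (named after the hostname), updating README.md and metadata as topics finish.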
+ parsed = urlparse(site)
+ sname = parsed.hostname or site.replace("https://", "").replace("http://", "").split('/')[0]
+ log.info("Processing site: %s", site)
+ sdir = base / sname
+ posts_d = sdir / 'posts'
+ tops_d = sdir / 'rendered-topics'
+ posts_d.mkdir(parents=True, exist_ok=True)
+ tops_d.mkdir(parents=True, exist_ok=True)
+ meta_file = sdir / '.metadata.json'
+ meta = {"archived_topic_ids": {}, "topics": {}}
- The README.md is updated incrementally after each topic is rendered.
- """
- parsed = urlparse(site_url)
- site_name = parsed.hostname or site_url.replace("https://", "").replace("http://", "").split('/')[0]
- log.info("Processing site: %s", site_url)
- site_target_dir = base_target_dir / site_name
- posts_dir = site_target_dir / 'posts'
- topics_dir = site_target_dir / 'rendered-topics'
- posts_dir.mkdir(parents=True, exist_ok=True)
- topics_dir.mkdir(parents=True, exist_ok=True)
- metadata_file = site_target_dir / '.metadata.json'
-
- # Load stored metadata if exists.
- metadata = {}
- archived_post_ids = set()
- if metadata_file.exists():
+ if meta_file.exists():
try:
- metadata = json.loads(metadata_file.read_text())
- if "archived_post_ids" in metadata:
- archived_post_ids = set(int(x) for x in metadata.get('archived_post_ids', []))
+ meta = json.loads(meta_file.read_text())
except Exception as e:
- log.error("Failed to read/parse metadata file for %s: %s", site_url, e)
+ log.error("Failed reading meta for %s: %s", site, e)
- posts_json = http_get_json(site_url, '/posts.json')
- posts = posts_json.get('latest_posts', [])
- last_id = None
- should_stop = False
+ rendered_topics = meta.get("topics", {})
+ topic_ids_to_process = fetch_topic_ids(site)
+ log.debug("Topic IDs to process: %s", topic_ids_to_process)
- # List to accumulate info for final README generation.
- rendered_topics_overall = []
-
- while posts:
- log.info("Processing %d posts for %s", len(posts), site_url)
- topics_to_render = {} # Unique topics in this batch.
- for json_post in posts:
+ rend_all = {}
+
+ cats = fetch_cats(site) # fetch the category name mapping once and reuse it for every topic
+ with ThreadPoolExecutor(max_workers=10) as executor:
+ future_to_tid = {executor.submit(render_topic, site, tid, tops_d, cats): tid for tid in topic_ids_to_process}
+
+ for future in as_completed(future_to_tid):
+ tid = future_to_tid[future]
try:
- post = Post.from_json(json_post)
+ rendered = future.result()
+ if rendered:
+ rend_all[rendered["id"]] = rendered
+ meta.setdefault("topics", {})[str(rendered["id"])] = rendered
+ meta.setdefault("archived_topic_ids", {})[str(rendered["id"])] = {
+ "rendered_at": datetime.datetime.now().isoformat()
+ }
+ update_meta(meta_file, meta)
+ append_readme(sdir, rendered)
except Exception as e:
- log.warning("Failed to deserialize post %s: %s", json_post, e)
- continue
- if post.id in archived_post_ids:
- log.debug("Post %s already archived, skipping", post.id)
- continue
- post.save(posts_dir)
- archived_post_ids.add(post.id)
- last_id = post.id
- topic = post.get_topic()
- topics_to_render[topic.id] = topic
- # Update metadata right away so that already processed posts won't be lost on interrupt.
- metadata['archived_post_ids'] = sorted(archived_post_ids)
- update_metadata(metadata_file, metadata)
- if topics_to_render:
- log.info("Rendering %d topics concurrently for %s.", len(topics_to_render), site_url)
- rendered = render_topics_concurrently(site_url, topics_to_render, topics_dir, max_workers=8)
- rendered_topics_overall.extend(rendered)
- if should_stop:
- log.info("Stopping pagination loop based on sync date for %s.", site_url)
- break
- if last_id is None or last_id <= 1:
- log.info("No valid last_id found for %s. Ending pagination loop.", site_url)
- break
- time.sleep(5)
- posts = http_get_json(site_url, f'/posts.json?before={last_id - 1}').get('latest_posts', [])
- while not posts and last_id and last_id >= 0:
- last_id -= 49
- posts = http_get_json(site_url, f'/posts.json?before={last_id}').get('latest_posts', [])
- time.sleep(1)
+ log.error("Error rendering topic %s: %s", tid, e)
- # Final merge/update of README from all rendered topics.
- if rendered_topics_overall:
- existing = read_existing_readme(site_target_dir)
- for new_topic in rendered_topics_overall:
- if new_topic["id"] not in existing:
- existing[new_topic["id"]] = new_topic
- write_readme(site_target_dir, existing)
+ if rend_all:
+ write_readme(sdir, rend_all)
else:
- log.info("No topics rendered for %s; skipping final README.md generation.", site_url)
+ log.info("Site %s: No topics rendered; skipping final README.", site)
+ update_meta(meta_file, meta)
+
+def fetch_cats(site: str) -> dict:
+ """Fetch topic categories using the /categories.json endpoint for now."""
+ try:
+ js = fetch_json(site + "/categories.json")
+ cats = js.get("category_list", {}).get("categories", [])
+ mapping = {int(c["id"]): c["name"] for c in cats}
+ log.info("Fetched %d categories from %s", len(mapping), site)
+ return mapping
+ except Exception as e:
+ log.error("Failed fetch categories from %s: %s", site, e)
+ return {}
def main() -> None:
- parameters = args()
- base_target_dir = parameters.target_dir
- if not isinstance(base_target_dir, Path):
- base_target_dir = Path(base_target_dir)
- base_target_dir.mkdir(parents=True, exist_ok=True)
- sites = parse_sites(parameters.urls)
+ params = args()
+ base = params.target_dir if isinstance(params.target_dir, Path) else Path(params.target_dir)
+ base.mkdir(parents=True, exist_ok=True)
+ sites = parse_sites(params.urls)
if not sites:
log.error("No valid sites provided. Exiting.")
sys.exit(1)
- for site_url in sites:
- process_site(site_url, base_target_dir)
+ for s in sites:
+ process_site(s, base)
if __name__ == "__main__":
main()