From c2f1fa934583825fa6f27b2186a465b9549f2059 Mon Sep 17 00:00:00 2001 From: c0mmando <103726157+c0mmando@users.noreply.github.com> Date: Thu, 3 Apr 2025 02:34:49 +0000 Subject: [PATCH] Refactor code, add features, fix bugs - Removed duplicate post titles - Fixed script termination - Removed duplicates in readme - Removed double image links - Clean up post titles - Organized readme topics by category - Fix bug preventing archive of more than 20 posts per topic --- discourse2github.py | 974 ++++++++++++++++++++++---------------------- 1 file changed, 487 insertions(+), 487 deletions(-) diff --git a/discourse2github.py b/discourse2github.py index 8b37072..8a9160c 100644 --- a/discourse2github.py +++ b/discourse2github.py @@ -2,19 +2,19 @@ """ Archive Discourse posts and render topics to Markdown from multiple sites. -This script downloads posts from one or more Discourse servers via their APIs. -It archives new posts as JSON files (skipping those already saved or archived), -renders topics to Markdown files for each batch of posts concurrently (with images -downloaded and link URLs rewritten as relative paths), updates a metadata file -after each post is indexed, and then updates a README.md with a table of contents -linking to each archived topic. +Uses locally archived JSON posts to render Markdown topics. The API is only used +to check/newly fetch posts for a topic. The API endpoints used are: + - https://{defaultHost}/t/{topic_id}.json (for topic metadata) + - https://{defaultHost}/posts/{post_id}.json (for individual posts) + - https://{defaultHost}/c/{slug}/{id}.json (for listing topics by category) Usage: - ./discourse2github.py --urls https://forum.hackliberty.org,https://forum.qubes-os.org --target-dir ./archive + ./discourse2github.py --urls https://forum.example.org,... --target-dir ./archive """ import argparse import concurrent.futures +import datetime import functools import json import logging @@ -23,594 +23,594 @@ import re import sys import time import urllib.request -import datetime -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from urllib.parse import urlparse +from concurrent.futures import ThreadPoolExecutor, as_completed + import html2text # pip install html2text from bs4 import BeautifulSoup # pip install beautifulsoup4 -# Set up logging. If the 'rich' module is available, it will be used. -loglevel = 'DEBUG' if os.environ.get('DEBUG') else 'INFO' +# Logging setup: use rich if available. 
+lvl = 'DEBUG' if os.environ.get('DEBUG') else 'INFO' try: from rich.logging import RichHandler - logging.basicConfig(level=loglevel, datefmt="[%X]", handlers=[RichHandler()]) + logging.basicConfig(level=lvl, datefmt="[%X]", handlers=[RichHandler()]) except ImportError: - logging.basicConfig(level=loglevel) + logging.basicConfig(level=lvl) log = logging.getLogger('archive') -# Argument parser -parser = argparse.ArgumentParser( - description='Archive topics from one or more Discourse installations and render to Markdown') -parser.add_argument( - '--urls', - help='Comma-separated URLs of Discourse servers (for example: "https://forum.hackliberty.org,https://forum.qubes-os.org")', - default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org')) -parser.add_argument( - '--debug', action='store_true', default=os.environ.get('DEBUG', False)) -parser.add_argument( - '-t', '--target-dir', help='Target base directory for the archives', - default=Path(os.environ.get('TARGET_DIR', './archive'))) +# Config constants +BATCH_SIZE = 100 +SLEEP_SEC = 2 +MAX_ITER = 1000 +RETRY_MAX = 5 # Maximum retries on error + +# Argument Parser +parser = argparse.ArgumentParser(description='Archive and render Discourse topics.') +parser.add_argument('--urls', help='Comma-separated Discourse URLs', + default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org')) +parser.add_argument('--debug', action='store_true', default=os.environ.get('DEBUG', False)) +parser.add_argument('-t', '--target-dir', help='Base directory for archives', + default=Path(os.environ.get('TARGET_DIR', './archive'))) @functools.cache def args(): return parser.parse_args() -def parse_sites(urls_string: str) -> list: - """Return a list of cleaned-up site URLs.""" - return [url.strip().rstrip('/') for url in urls_string.split(',') if url.strip()] +def parse_sites(urls: str) -> list: + return [u.strip().rstrip('/') for u in urls.split(',') if u.strip()] -def http_get(site_url: str, path: str, timeout: int = 15) -> str: - """Simple HTTP GET with exponential backoff and a timeout.""" - full_url = f"{site_url}{path}" - log.debug("HTTP GET %s", full_url) +# API credentials (optional) +API_KEY = os.environ.get("DISCOURSE_API_KEY", "") +API_USER = os.environ.get("DISCOURSE_API_USERNAME", "") + +def fetch_url(url: str, timeout=15) -> str: + """ + Fetch a URL with a retry loop. Logs additional debug info. + If a 404 error is encountered, immediately return None. + For other errors, wait and retry until RETRY_MAX is reached. + """ backoff = 3 - while True: + attempts = 0 + req = urllib.request.Request(url) + # Add API headers if available. 
+ if API_KEY and API_USER: + req.add_header("Api-Key", API_KEY) + req.add_header("Api-Username", API_USER) + while attempts < RETRY_MAX: try: - with urllib.request.urlopen(full_url, timeout=timeout) as response: - return response.read().decode() - except Exception as e: - log.debug("Error fetching %s: %s -- Retrying in %d seconds", full_url, e, backoff) + log.debug("Attempt %d: Fetching URL: %s", attempts + 1, url) + with urllib.request.urlopen(req, timeout=timeout) as resp: + data = resp.read().decode() + log.debug( + "Successfully fetched URL: %s | HTTP Status: %s | Response length: %d bytes", + url, resp.status, len(data) + ) + return data + except urllib.error.HTTPError as e: + if e.code == 404: + log.warning("Resource not found (404) for %s, skipping further retries", url) + return None + attempts += 1 + log.warning("HTTPError fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True) time.sleep(backoff) backoff *= 2 - if backoff >= 256: - log.exception("Rate limit or unrecoverable error for %s", full_url) - sys.exit(1) + except Exception as e: + attempts += 1 + log.warning("Error fetching %s: %s (attempt %d/%d)", url, e, attempts, RETRY_MAX, exc_info=True) + time.sleep(backoff) + backoff *= 2 + log.error("Failed fetching %s after %d attempts.", url, RETRY_MAX) + return None -def http_get_json(site_url: str, path: str, timeout: int = 15) -> dict: - """Fetch URL contents from a specific site and decode JSON.""" +def fetch_json(url: str, timeout=15) -> dict: + """ + Fetch JSON data from a URL. + Logs the received raw data size and the parsed JSON keys where applicable. + Returns None if the fetch failed or returned 404. + """ + data = fetch_url(url, timeout) + if data is None: + log.debug("No data returned for URL: %s", url) + return None + log.debug("Fetched raw data from %s (length: %d bytes)", url, len(data)) try: - return json.loads(http_get(site_url, path, timeout=timeout)) - except json.JSONDecodeError: - log.warning("Unable to decode JSON response from %r", path) - raise - -# ----- Helper: Truncate Filename ----- -def truncate_filename(filename: str, max_length: int = 255) -> str: - """ - Truncates the file name to a maximum length (default 255 characters). - It preserves the file extension. - """ - if len(filename) <= max_length: - return filename - p = Path(filename) - stem = p.stem - suffix = "".join(p.suffixes) - max_stem_length = max_length - len(suffix) - if max_stem_length <= 0: - return filename[:max_length] - truncated_stem = stem[:max_stem_length] - return truncated_stem + suffix - -# ----- Image / Link Download Helpers ----- -def fix_url(original_url: str) -> str: - """Fix scheme-relative URLs by prepending https: if necessary.""" - if original_url.startswith("//"): - fixed = "https:" + original_url - log.debug("Converted scheme-relative URL: %s -> %s", original_url, fixed) - return fixed - return original_url - -def download_image(image_url: str, dest_path: Path, timeout: int = 15): - """ - Download an image from image_url and save it to dest_path. - If the file already exists, skip downloading. - A timeout is specified to avoid hanging indefinitely. 
- """ - if dest_path.exists(): - log.debug("Image already downloaded: %s", dest_path) - return - try: - log.info("Downloading image: %s", image_url) - with urllib.request.urlopen(fix_url(image_url), timeout=timeout) as response: - image_data = response.read() - dest_path.parent.mkdir(parents=True, exist_ok=True) - dest_path.write_bytes(image_data) - log.info("Saved image to %s", dest_path) - except Exception as e: - log.error("Failed to download image %s: %s", image_url, e) - -def process_srcset(srcset_value: str, topic_dir: Path, topic_relative_path: str) -> str: - """ - Process a srcset attribute value, downloading images and returning a rewritten value. - Downloads every image referenced regardless of URL content. - """ - entries = srcset_value.split(",") - fixed_entries = [] - for entry in entries: - parts = entry.strip().split() - if not parts: - continue - orig_url = parts[0] - fixed_url = fix_url(orig_url) - parsed = urlparse(fixed_url) - image_filename = os.path.basename(parsed.path) - if not image_filename: - log.warning("Skipping srcset URL with empty filename: %s", fixed_url) - continue - dest_path = topic_dir / image_filename - download_image(fixed_url, dest_path) - full_path = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/') - if len(parts) > 1: - fixed_entries.append(f"{full_path} {parts[1]}") + js = json.loads(data) + if isinstance(js, dict): + log.debug("JSON parsed from %s, keys: %s", url, list(js.keys())) else: - fixed_entries.append(f"{full_path}") - return ", ".join(fixed_entries) + log.debug("JSON parsed from %s is not a dict (type: %s)", url, type(js).__name__) + return js + except json.JSONDecodeError as e: + log.error("JSON decode error for %s: %s", url, e, exc_info=True) + return None -def is_image_link(url: str) -> bool: - """Determine if the URL points to an image by its extension.""" - image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".webp") - parsed = urlparse(url) - filename = os.path.basename(parsed.path).lower() - return filename.endswith(image_extensions) -def process_html(html_content: str, topic_dir: Path, topic_relative_path: str) -> str: - """ - Process the given HTML: download referenced images and rewrite links. - Processes both (src, srcset) and tags pointing to images. - Downloads every image referenced in the HTML. - Returns the modified HTML. - """ - soup = BeautifulSoup(html_content, "html.parser") +def truncate_fn(name: str, max_len=255) -> str: + if len(name) <= max_len: + return name + p = Path(name) + stem, suffix = p.stem, "".join(p.suffixes) + allowed = max_len - len(suffix) + return (stem[:allowed] if allowed > 0 else name[:max_len]) + suffix - # Process tags. 
+# --- Helpers for images & HTML content --- +def fix_url(url: str) -> str: + return "https:" + url if url.startswith("//") else url + +def download_img(url: str, dest: Path, tid: int = None, timeout=15): + if dest.exists(): + log.debug("Img exists for topic %s: %s", tid, dest) + return + attempts = 0 + backoff = 2 + while attempts < RETRY_MAX: + try: + log.info("Downloading img for topic %s: %s", tid, url) + with urllib.request.urlopen(fix_url(url), timeout=timeout) as r: + data = r.read() + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(data) + log.info("Saved img for topic %s to %s", tid, dest) + return + except Exception as e: + attempts += 1 + log.warning("Failed downloading img for topic %s from %s: %s (attempt %d/%d)", tid, url, e, attempts, RETRY_MAX) + time.sleep(backoff) + backoff *= 2 + log.error("Exceeded maximum retries downloading image %s for topic %s", url, tid) + +def proc_srcset(srcset: str, tdir: Path, rel: str, tid: int) -> str: + parts = [e.strip() for e in srcset.split(",")] + out = [] + for e in parts: + seg = e.split() + if not seg: + continue + orig = seg[0] + fixed = fix_url(orig) + fname = os.path.basename(urlparse(fixed).path) + if not fname: + log.warning("Empty filename in srcset for topic %s: %s", tid, fixed) + continue + dest = tdir / fname + download_img(fixed, dest, tid) + full = os.path.join(rel, fname).replace(os.sep, '/') + out.append(f"{full} {seg[1]}" if len(seg) > 1 else full) + return ", ".join(out) + +def is_img_link(url: str) -> bool: + return os.path.basename(urlparse(url).path).lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")) + +def remove_img_anchor(soup): + # Remove anchors that wrap images. + for a in soup.find_all("a"): + if a.find("img"): + a.replace_with(*a.contents) + return soup + +def proc_html(html, tdir: Path, rel: str, tid: int) -> str: + soup = BeautifulSoup(html, "html.parser") + cnt = 0 for img in soup.find_all("img"): src = img.get("src") if src: src = fix_url(src) - parsed = urlparse(src) - image_filename = os.path.basename(parsed.path) - if image_filename: - dest_path = topic_dir / image_filename - download_image(src, dest_path) - full_src = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/') - img["src"] = full_src + fname = os.path.basename(urlparse(src).path) + if fname: + dest = tdir / fname + download_img(src, dest, tid) + cnt += 1 + img["src"] = os.path.join(rel, fname).replace(os.sep, '/') else: - log.warning("Skipping image with empty filename from src: %s", src) - srcset = img.get("srcset") - if srcset: - new_srcset = process_srcset(srcset, topic_dir, topic_relative_path) - img["srcset"] = new_srcset - - # Process tags whose href points to images. 
+ log.warning("Empty filename in src for topic %s: %s", tid, src) + if s := img.get("srcset"): + img["srcset"] = proc_srcset(s, tdir, rel, tid) for a in soup.find_all("a"): href = a.get("href") if href: - fixed_href = fix_url(href) - if is_image_link(fixed_href): - parsed = urlparse(fixed_href) - image_filename = os.path.basename(parsed.path) - if image_filename: - dest_path = topic_dir / image_filename - download_image(fixed_href, dest_path) - new_href = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/') - a["href"] = new_href + fixed = fix_url(href) + if is_img_link(fixed): + fname = os.path.basename(urlparse(fixed).path) + if fname: + dest = tdir / fname + download_img(fixed, dest, tid) + cnt += 1 + a["href"] = os.path.join(rel, fname).replace(os.sep, '/') + if a.string: + a.string.replace_with("") else: - log.warning("Skipping link with empty filename from href: %s", fixed_href) + log.warning("Empty filename in href for topic %s: %s", tid, fixed) + remove_img_anchor(soup) + log.debug("Processed %d images for topic %s", cnt, tid) return str(soup) -def slugify(value: str) -> str: - """ - Normalizes string, removes non-alphanumeric characters, and converts whitespace to hyphens. - Useful for constructing filenames. - """ - value = str(value) - value = value.strip().lower() - value = re.sub(r'[^a-z0-9\s-]', '', value) - value = re.sub(r'[\s-]+', '-', value) - return value or "untitled" +def slugify(s: str) -> str: + s = re.sub(r'[^a-z0-9\s-]', '', s.strip().lower()) + return re.sub(r'[\s-]+', '-', s) or "untitled" -# ----- Data Models ----- +# --- Data models --- @dataclass(frozen=True) class PostTopic: id: int slug: str title: str + category_id: int -@dataclass(frozen=True) +@dataclass class Post: id: int slug: str raw: dict - def get_created_at(self) -> datetime.datetime: + def created_at(self) -> datetime.datetime: return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00")) - def save(self, dir: Path): - """Save the raw JSON post to disk if not already archived.""" + def updated_at(self) -> datetime.datetime: + return datetime.datetime.fromisoformat(self.raw['updated_at'].replace("Z", "+00:00")) + + def save(self, d: Path) -> None: + """Save the post JSON to disk (archive).""" idstr = str(self.id).zfill(10) - filename = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json" - filename = truncate_filename(filename) - folder_name = self.get_created_at().strftime('%Y-%m-%B') - full_path = dir / folder_name / filename - - if full_path.exists(): - log.debug("Post %s already saved, skipping", self.id) - return - - full_path.parent.mkdir(parents=True, exist_ok=True) - log.info("Saving post %s to %s", self.id, full_path) - full_path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8') - - def get_topic(self) -> PostTopic: - return PostTopic( - id=self.raw.get('topic_id', self.id), - slug=self.raw.get('topic_slug', self.slug), - title=self.raw.get('topic_title', self.raw.get('title', 'No Title')), - ) + fn = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json" + fn = truncate_fn(fn) + folder = self.created_at().strftime('%Y-%m-%B') + path = d / folder / fn + # Only write if changed. 
+ if path.exists(): + try: + ex = json.loads(path.read_text(encoding='utf-8')) + if ex.get("updated_at") == self.raw.get("updated_at"): + log.debug("Post %s unchanged; skip saving.", self.id) + return + except Exception as e: + log.debug("Error reading %s: %s", path, e) + path.parent.mkdir(parents=True, exist_ok=True) + log.info("Saving post %s to %s", self.id, path) + path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8') @classmethod def from_json(cls, j: dict) -> 'Post': - return cls( - id=j['id'], - slug=j.get('topic_slug', 'unknown'), - raw=j, - ) + return cls(id=j['id'], slug=j.get('topic_slug', 'unknown'), raw=j) -@dataclass(frozen=True) +@dataclass class Topic: id: int slug: str - raw: dict - markdown: str + title: str + category_id: int + created_at_str: str + markdown: str = field(default="") # initial markdown content - def get_created_at(self) -> datetime.datetime: - return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00")) + def created_at(self) -> datetime.datetime: + return datetime.datetime.fromisoformat(self.created_at_str.replace("Z", "+00:00")) - def save_rendered(self, dir: Path): - """ - Save the rendered Markdown topic to disk. - Filename built from creation date, slug, and id. - Truncate the filename if needed. - """ - date_str = str(self.get_created_at().date()) - filename = f"{date_str}-{self.slug}-id{self.id}.md" - filename = truncate_filename(filename) - folder_name = self.get_created_at().strftime('%Y-%m-%B') - full_path = dir / folder_name / filename - full_path.parent.mkdir(parents=True, exist_ok=True) - log.info("Saving rendered topic %s to %s", self.id, full_path) - rendered_markdown = f"# {self.raw.get('title', 'No Title')}\n\n{self.markdown}" - full_path.write_text(rendered_markdown, encoding='utf-8') - # Return the relative path from the repository root. - return full_path.relative_to(dir.parent) + def save_rendered(self, d: Path) -> Path: + date_s = str(self.created_at().date()) + fn = f"{date_s}-{self.slug}-id{self.id}.md" + fn = truncate_fn(fn) + folder = self.created_at().strftime('%Y-%m-%B') + path = d / folder / fn + path.parent.mkdir(parents=True, exist_ok=True) + log.info("Saving rendered topic %s to %s", self.id, path) + path.write_text(self.markdown, encoding='utf-8') + return path.relative_to(d.parent) - @classmethod - def from_json(cls, t: dict, markdown: str) -> 'Topic': - slug = t.get('slug') or t.get('topic_slug') or "unknown" - return cls( - id=t.get('id', 0), - slug=slug, - raw=t, - markdown=markdown, - ) +# --- API fetching for topics and posts --- +def fetch_topic_meta(site: str, topic_id: int) -> dict: + url = f"{site}/t/{topic_id}.json" + result = fetch_json(url) + if result is None: + log.warning("Topic metadata not found for topic %s", topic_id) + return result -# ----- New Helper for Rendering Topics with Image Downloading ----- -def render_topic(site_url: str, topic: PostTopic, topics_dir: Path): +def fetch_single_post(site: str, post_id: int) -> dict: """ - Render a single topic to Markdown by: - 1. Fetching the topic JSON. - 2. Downloading its associated images and rewriting their URLs. - 3. Converting processed HTML to Markdown (using html2text). - 4. Saving the rendered Markdown document. - - Images are saved to an assets directory relative to the site target directory. - Returns a dictionary with topic info for README updating. + Fetch a single post by post_id from the site. + Logs detailed info upon a successful fetch. 
""" - try: - log.info("Fetching topic %s JSON from %s", topic.id, site_url) - topic_data = http_get_json(site_url, f"/t/{topic.id}.json") - except Exception as e: - log.warning("Failed to fetch topic JSON for topic %s: %s", topic.id, e) + url = f"{site}/posts/{post_id}.json" + result = fetch_json(url) + if result is None: + log.warning("Post %s not found on site %s", post_id, site) + else: + # Log detailed post information if available + username = result.get("username", "unknown") + topic_slug = result.get("topic_slug", "unknown") + created_at = result.get("created_at", "unknown time") + log.debug("Fetched post %s: topic_slug='%s', username='%s', created_at='%s'", + post_id, topic_slug, username, created_at) + # Optionally, you can also log the whole JSON response or its size: + log.debug("Post %s JSON size: %d bytes", post_id, len(json.dumps(result))) + return result + +# --- Rendering functions using fresh API post data --- +def render_topic(site: str, topic_id: int, tops_dir: Path, cats: dict) -> dict: + """ + Render each post individually and append it immediately to the topic markdown file. + This version fetches EVERY post in the topic (using additional API calls if needed), + not just the first 20. + """ + topic_meta = fetch_topic_meta(site, topic_id) + if not topic_meta: + log.warning("No metadata found for topic %s; skipping render.", topic_id) return None - # Define the assets directory in the repository root. - assets_dir = topics_dir.parent / "assets" / "images" / f"{topic.id}" - assets_dir.mkdir(parents=True, exist_ok=True) + # Use the topic meta from /t/{topic_id}.json + slug = topic_meta.get("slug", "unknown") + title = topic_meta.get("title", "No Title") + category_id = int(topic_meta.get("category_id", 0)) + created_at_str = topic_meta.get("created_at", datetime.datetime.now().isoformat()) - # Determine the directory where the rendered markdown file will be saved. - try: - created_at = datetime.datetime.fromisoformat(topic_data['created_at'].replace("Z", "+00:00")) - except Exception as e: - log.error("Could not parse created_at for topic %s: %s", topic.id, e) - created_at = datetime.datetime.now() - folder_name = created_at.strftime('%Y-%m-%B') - rendered_md_dir = topics_dir / folder_name + # Create assets dir for images. + assets = tops_dir.parent / "assets" / "images" / f"{topic_id}" + assets.mkdir(parents=True, exist_ok=True) + folder = datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).strftime('%Y-%m-%B') + md_dir = tops_dir / folder + rel_path = os.path.relpath(assets, md_dir) - # Compute the relative path from the markdown file's directory to the assets directory. 
- topic_relative_path = os.path.relpath(assets_dir, rendered_md_dir) + # Create or truncate the markdown topic file + date_s = str(datetime.datetime.fromisoformat(created_at_str.replace("Z", "+00:00")).date()) + fn = f"{date_s}-{slug}-id{topic_id}.md" + fn = truncate_fn(fn) + topic_md_path = md_dir / fn + topic_md_path.parent.mkdir(parents=True, exist_ok=True) + log.info("Creating markdown file for topic %s at %s", topic_id, topic_md_path) + # Write the topic title as header + with topic_md_path.open(mode="w", encoding="utf8") as f: + f.write(f"# {title}\n\n") - posts = topic_data.get("post_stream", {}).get("posts", []) - if not posts: - log.error("No posts found for topic %s", topic.id) - return None + conv = html2text.HTML2Text() + conv.body_width = 0 - converter = html2text.HTML2Text() - converter.body_width = 0 - md_sections = [] - for post in posts: - created = post.get("created_at", "unknown") - updated = post.get("updated_at", "unknown") - post_number = post.get("post_number", 0) - cooked_html = post.get("cooked", "") - # Pass the corrected topic_relative_path into process_html() - processed_html = process_html(cooked_html, assets_dir, topic_relative_path) - post_md = converter.handle(processed_html) - header_lines = [ - f"**ID:** {topic.id}", - f"**USERNAME:** {post.get('username', 'unknown')}", - f"**POST NUMBER:** {post_number}", - f"**CREATED AT:** {created}", - f"**UPDATED AT:** {updated}", - ] - # Join header lines with two newlines so each appears on its own line in GitHub Markdown. - header = "\n\n".join(header_lines) - section = f"## Post {post_number}\n\n{header}\n\n---\n\n{post_md}" - md_sections.append(section) - full_md = "\n\n".join(md_sections) - topic_title = topic_data.get("title", "No Title") - full_md = f"# {topic_title}\n\n" + full_md + # ---- Modified section: Fetch ALL posts for the topic ---- + # Get posts from topic_meta (first 20 posts) + posts_meta = topic_meta.get("post_stream", {}).get("posts", []) + # Also get the full post stream (IDs) which might include extra post IDs + full_stream = topic_meta.get("post_stream", {}).get("stream", []) + # Identify extra post IDs that might not be in posts_meta + # (Since posts_meta are typically the first 20 posts.) + extra_ids = [pid for pid in full_stream if pid not in [p.get("id") for p in posts_meta]] + log.debug("Topic %s: %d posts in initial load, %d extra IDs detected.", topic_id, len(posts_meta), len(extra_ids)) - topic_obj = Topic.from_json(topic_data, full_md) - saved_relative_path = topic_obj.save_rendered(topics_dir) - log.info("Saved rendered topic %s (%s)", topic_obj.id, topic_obj.slug) - # Return topic info for README. 
-    return {
-        "id": topic_obj.id,
-        "slug": topic_obj.slug,
-        "title": topic_title,
-        "relative_path": str(saved_relative_path)
-    }
+    # Fetch extras in chunks (say, 20 per request)
+    n = 20
+    if extra_ids:
+        chunks = [extra_ids[i:i+n] for i in range(0, len(extra_ids), n)]
+        for chunk in chunks:
+            # Build query string with multiple post_ids[] parameters
+            qs = "&".join([f"post_ids[]={pid}" for pid in chunk])
+            posts_extra_url = f"{site}/t/{topic_id}/posts.json?{qs}"
+            extra_response = fetch_json(posts_extra_url)
+            if extra_response and "post_stream" in extra_response and "posts" in extra_response["post_stream"]:
+                extra_posts = extra_response["post_stream"]["posts"]
+                posts_meta.extend(extra_posts)
+            else:
+                log.warning("Failed fetching extra posts for topic %s with URL: %s", topic_id, posts_extra_url)

-# ----- Concurrent Rendering Helper -----
-def render_topics_concurrently(site_url: str, topics: dict, topics_dir: Path, max_workers: int = 8):
-    """
-    Render multiple topics concurrently.
-    Returns a list of rendered topic information dictionaries.
-    """
-    rendered_topics_info = []
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(render_topic, site_url, topic, topics_dir): topic for topic in topics.values()}
-        for future in concurrent.futures.as_completed(futures):
-            try:
-                result = future.result()
-                if result:
-                    rendered_topics_info.append(result)
-                    # Update the README incrementally after each topic is rendered.
-                    update_readme_incrementally(topics_dir.parent, result)
-            except Exception as exc:
-                log.error("A topic generated an exception: %s", exc)
-    return rendered_topics_info
+    # Sort posts by (for example) their post_number if available (to preserve original order)
+    posts_meta.sort(key=lambda p: p.get("post_number", 0))
+    # ---- End fetch-all posts section ----

-def update_metadata(metadata_file: Path, metadata: dict):
-    """Writes the metadata as a JSON file to disk."""
-    log.debug("Updating metadata: %s", metadata)
-    metadata_file.write_text(json.dumps(metadata, indent=2), encoding='utf-8')
+    # Extract post IDs from the combined posts_meta
+    post_ids = [post["id"] for post in posts_meta]
+    log.debug("Processing a total of %d posts for topic %s", len(post_ids), topic_id)

-# A helper pattern to match a TOC line (i.e. a line with the topic entry and its id)
-TOC_LINE_PATTERN = re.compile(
-    r"- \[(?P<title>.+?)\]\((?P<relative_path>.+?)\)\s*<!--\s*id:\s*(?P<id>\d+)\s*-->")
-
-# ----- README Update Helpers -----
-def read_existing_readme(repo_root: Path):
-    """
-    Read the existing README.md from repo_root and return a dictionary of topics.
-    The keys will be the topic IDs (as integers) and the values as the topic dict.
-    If the file doesn't exist, return an empty dict.
- """ - readme_path = repo_root / "README.md" - existing_topics = {} - if readme_path.exists(): + # Now process each post (as before) + for post in posts_meta: try: - content = readme_path.read_text(encoding='utf-8') - # Expecting lines like: - [Topic Title](relative_path) <!-- id: topic_id --> - pattern = TOC_LINE_PATTERN - for line in content.splitlines(): - match = pattern.match(line) - if match: - topic_id = int(match.group("id")) - existing_topics[topic_id] = { - "id": topic_id, - "title": match.group("title"), - "relative_path": match.group("relative_path") - } + post_id = post.get("id") + log.debug("Processing post ID %s for topic %s", post_id, topic_id) + # Create header for the post and fetch necessary dates + cdt = datetime.datetime.fromisoformat(post.get("created_at").replace("Z", "+00:00")) + udt = datetime.datetime.fromisoformat(post.get("updated_at", "").replace("Z", "+00:00")) if post.get("updated_at") else cdt + hdr = (f"> **Post #{post.get('post_number', 0)} • {post.get('username', 'unknown')}**\n" + f"> Created: {cdt.strftime('%Y-%m-%d %H:%M')}\n" + f"> Updated: {udt.strftime('%Y-%m-%d %H:%M')}") + cooked = post.get("cooked", "") + proc = proc_html(cooked, assets, rel_path, topic_id) + md_post = conv.handle(proc) + + # Clean up the markdown post + clean_lines = [] + for l in md_post.splitlines(): + if re.search(r'\S+\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)$', l, flags=re.IGNORECASE): + continue + clean_lines.append(l) + md_post = "\n".join(clean_lines) + md_post = re.sub(r'(\S+)\s*\d+\s*[×x]\s*\d+\s+\d+(\.\d+)?\s*(KB|MB)', r'\1', md_post, flags=re.IGNORECASE) + + section = f"<!-- ✦✦✦ POST START ✦✦✦ -->\n\n{hdr}\n\n{md_post}\n\n<!-- ✦✦✦ POST END ✦✦✦ -->\n\n" + with topic_md_path.open(mode="a", encoding="utf8") as f: + f.write(section) + log.debug("Appended post #%s (ID %s) to topic markdown file", post.get("post_number", "?"), post_id) + time.sleep(0.2) # to ensure sequential API calls (if needed) except Exception as e: - log.error("Failed to parse existing README.md: %s", e) - return existing_topics + log.error("Error processing post %s: %s", post.get("id"), e) -def update_readme_incrementally(repo_root: Path, new_topic: dict): - """ - Update or create README.md in repo_root by merging the new topic into the existing list. - If the topic already exists, report that. Otherwise, append the new topic to the TOC. - """ - topic_id = new_topic["id"] - existing_topics = read_existing_readme(repo_root) - if topic_id in existing_topics: - log.debug("Topic with id %s already exists in README.md", topic_id) - return - existing_topics[topic_id] = new_topic - append_to_readme(repo_root, new_topic) + # After processing, read the file content and return the topic info. + full_md = topic_md_path.read_text(encoding='utf8') + topic_obj = Topic( + id=topic_id, + slug=slug, + title=title, + category_id=category_id, + created_at_str=created_at_str, + markdown=full_md, + ) + rel_saved = topic_obj.save_rendered(tops_dir) # This rewrites the file; that's acceptable. + log.info("Rendered topic %s (%s) with %d posts", topic_obj.id, topic_obj.slug, len(post_ids)) + return {"id": topic_id, "title": title, "relative_path": str(rel_saved), "category": cats.get(category_id, "Uncategorized")} -def append_to_readme(repo_root: Path, new_topic: dict): - """ - Append a new topic to the existing README.md table-of-contents (TOC). - If README.md doesn't exist, create it with a header and the new topic. 
-    """
-    readme_path = repo_root / "README.md"
-    toc_header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
-    new_toc_line = f"- [{new_topic['title']}]({new_topic['relative_path']}) <!-- id: {new_topic['id']} -->"
-    if readme_path.exists():
+# --- README update functions ---
+TOC_PAT = re.compile(r"- \[(?P<title>.+?)\]\((?P<rel>.+?)\) <!-- id: (?P<id>\d+) -->")
+def read_readme(root: Path):
+    rp = root / "README.md"
+    topics = {}
+    if rp.exists():
         try:
-            # Read the existing content
-            content = readme_path.read_text(encoding="utf-8")
-            lines = content.splitlines()
-            # Check if the file already has a TOC header by looking for the header marker.
+            for l in rp.read_text(encoding="utf-8").splitlines():
+                m = TOC_PAT.match(l.strip())
+                if m:
+                    tid = int(m.group("id"))
+                    topics[tid] = {"id": tid, "title": m.group("title"), "relative_path": m.group("rel")}
+        except Exception as e:
+            log.error("Failed parsing README.md: %s", e)
+    return topics
+
+def append_readme(root: Path, ntop: dict):
+    rp = root / "README.md"
+    header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
+    line = f"- [{ntop['title']}]({ntop['relative_path']}) <!-- id: {ntop['id']} -->"
+    if rp.exists():
+        try:
+            lines = rp.read_text(encoding="utf-8").splitlines()
             try:
-                toc_start = lines.index("## Table of Contents")
-                # Find the blank line after the TOC header if exists
-                insertion_index = toc_start + 1
-                # Advance until we find the first non-TOC line or reach the end.
-                while (
-                    insertion_index < len(lines)
-                    and TOC_LINE_PATTERN.match(lines[insertion_index].strip())
-                ):
-                    insertion_index += 1
-                # Now, insert our new entry just before the first non-TOC line.
-                lines.insert(insertion_index, new_toc_line)
-                new_content = "\n".join(lines)
+                idx = lines.index("## Table of Contents") + 1
+                while idx < len(lines) and TOC_PAT.match(lines[idx].strip()):
+                    idx += 1
+                lines.insert(idx, line)
+                newc = "\n".join(lines)
             except ValueError:
-                # "## Table of Contents" not found, so we create a new TOC block at the top
-                new_content = "\n".join(toc_header + [new_toc_line] + [""] + lines)
+                newc = "\n".join(header + [line] + [""] + lines)
         except Exception as e:
-            log.error("Failed to read existing README.md: %s", e)
-            # In case of error, default to creating a new README.md with header and new topic
-            new_content = "\n".join(toc_header + [new_toc_line])
+            log.error("Error reading README.md: %s", e)
+            newc = "\n".join(header + [line])
     else:
-        # README.md doesn't exist, create a new one with a standard header and the new TOC entry
-        new_content = "\n".join(toc_header + [new_toc_line])
-
+        newc = "\n".join(header + [line])
     try:
-        readme_path.write_text(new_content, encoding="utf-8")
-        log.info("Updated README.md at %s", readme_path)
+        rp.write_text(newc, encoding="utf-8")
+        log.info("Updated README.md at %s", rp)
     except Exception as e:
-        log.error("Failed to write README.md: %s", e)
+        log.error("Failed writing README.md: %s", e)

-def write_readme(site_target_dir: Path, topics: dict):
-    """
-    Given a dictionary of topics, write out the full README.md at the site target directory.
- """ - readme_path = site_target_dir / "README.md" +def write_readme(site_dir: Path, tops: dict): + rp = site_dir / "README.md" lines = ["# Archived Discourse Topics", "", "## Table of Contents", ""] - sorted_topics = sorted(topics.values(), key=lambda t: t["id"]) - for topic in sorted_topics: - line = f"- [{topic['title']}]({topic['relative_path']}) <!-- id: {topic['id']} -->" - lines.append(line) - content = "\n".join(lines) + group = {} + for t in tops.values(): + group.setdefault(t.get("category", "Uncategorized"), []).append(t) + for cat in sorted(group.keys()): + lines.append(f"### {cat}") + for t in sorted(group[cat], key=lambda x: x["id"]): + lines.append(f"- [{t['title']}]({t['relative_path']}) <!-- id: {t['id']} -->") + lines.append("") try: - readme_path.write_text(content, encoding="utf-8") - log.info("Finalized README.md updated at %s", readme_path) + rp.write_text("\n".join(lines), encoding='utf-8') + log.info("Finalized README.md at %s", rp) except Exception as e: - log.error("Failed to write final README.md: %s", e) + log.error("Failed writing final README.md: %s", e) -# ----- Site Processing Function ----- -def process_site(site_url: str, base_target_dir: Path): +def update_meta(meta_file: Path, meta: dict): + log.debug("Updating meta: %s", meta) + meta_file.write_text(json.dumps(meta, indent=2), encoding='utf-8') + +# --- New function to fetch topic IDs using list topics endpoint --- +def fetch_topic_ids(site: str) -> list: """ - Archive posts and render topics for a single site. - Each site gets its own subdirectory (named for its hostname) inside the base target directory, - and its own metadata file. + Fetch topic IDs from each category using /c/{slug}/{id}.json endpoint. + Returns a list of topic IDs. + """ + topic_ids = set() + # Get categories data + cats_js = fetch_json(f"{site}/categories.json") + if not cats_js: + log.error("Failed to fetch categories from %s", site) + return list(topic_ids) + cats = cats_js.get("category_list", {}).get("categories", []) + for cat in cats: + cat_id = cat.get("id") + cat_slug = cat.get("slug") + if not cat_id or not cat_slug: + continue + url = f"{site}/c/{cat_slug}/{cat_id}.json" + js = fetch_json(url) + if not js: + log.warning("Failed to fetch topics for category %s using %s", cat_id, url) + continue + topics = js.get("topic_list", {}).get("topics", []) + for t in topics: + tid = t.get("id") + if tid: + topic_ids.add(tid) + log.info("Fetched %d topic IDs from %s", len(topic_ids), site) + return list(topic_ids) + +# --- Main processing of a site --- +def process_site(site: str, base: Path): + parsed = urlparse(site) + sname = parsed.hostname or site.replace("https://", "").replace("http://", "").split('/')[0] + log.info("Processing site: %s", site) + sdir = base / sname + posts_d = sdir / 'posts' + tops_d = sdir / 'rendered-topics' + posts_d.mkdir(parents=True, exist_ok=True) + tops_d.mkdir(parents=True, exist_ok=True) + meta_file = sdir / '.metadata.json' + meta = {"archived_topic_ids": {}, "topics": {}} - The README.md is updated incrementally after each topic is rendered. 
- """ - parsed = urlparse(site_url) - site_name = parsed.hostname or site_url.replace("https://", "").replace("http://", "").split('/')[0] - log.info("Processing site: %s", site_url) - site_target_dir = base_target_dir / site_name - posts_dir = site_target_dir / 'posts' - topics_dir = site_target_dir / 'rendered-topics' - posts_dir.mkdir(parents=True, exist_ok=True) - topics_dir.mkdir(parents=True, exist_ok=True) - metadata_file = site_target_dir / '.metadata.json' - - # Load stored metadata if exists. - metadata = {} - archived_post_ids = set() - if metadata_file.exists(): + if meta_file.exists(): try: - metadata = json.loads(metadata_file.read_text()) - if "archived_post_ids" in metadata: - archived_post_ids = set(int(x) for x in metadata.get('archived_post_ids', [])) + meta = json.loads(meta_file.read_text()) except Exception as e: - log.error("Failed to read/parse metadata file for %s: %s", site_url, e) + log.error("Failed reading meta for %s: %s", site, e) - posts_json = http_get_json(site_url, '/posts.json') - posts = posts_json.get('latest_posts', []) - last_id = None - should_stop = False + rendered_topics = meta.get("topics", {}) + topic_ids_to_process = fetch_topic_ids(site) + log.debug("Topic IDs to process: %s", topic_ids_to_process) - # List to accumulate info for final README generation. - rendered_topics_overall = [] - - while posts: - log.info("Processing %d posts for %s", len(posts), site_url) - topics_to_render = {} # Unique topics in this batch. - for json_post in posts: + rend_all = {} + + with ThreadPoolExecutor(max_workers=10) as executor: + # fetch_cats is needed to provide the category mapping + future_to_tid = {executor.submit(render_topic, site, tid, tops_d, fetch_cats(site)): tid for tid in topic_ids_to_process} + + for future in as_completed(future_to_tid): + tid = future_to_tid[future] try: - post = Post.from_json(json_post) + rendered = future.result() + if rendered: + rend_all[rendered["id"]] = rendered + meta.setdefault("topics", {})[str(rendered["id"])] = rendered + meta.setdefault("archived_topic_ids", {})[str(rendered["id"])] = { + "rendered_at": datetime.datetime.now().isoformat() + } + update_meta(meta_file, meta) + append_readme(sdir, rendered) except Exception as e: - log.warning("Failed to deserialize post %s: %s", json_post, e) - continue - if post.id in archived_post_ids: - log.debug("Post %s already archived, skipping", post.id) - continue - post.save(posts_dir) - archived_post_ids.add(post.id) - last_id = post.id - topic = post.get_topic() - topics_to_render[topic.id] = topic - # Update metadata right away so that already processed posts won't be lost on interrupt. - metadata['archived_post_ids'] = sorted(archived_post_ids) - update_metadata(metadata_file, metadata) - if topics_to_render: - log.info("Rendering %d topics concurrently for %s.", len(topics_to_render), site_url) - rendered = render_topics_concurrently(site_url, topics_to_render, topics_dir, max_workers=8) - rendered_topics_overall.extend(rendered) - if should_stop: - log.info("Stopping pagination loop based on sync date for %s.", site_url) - break - if last_id is None or last_id <= 1: - log.info("No valid last_id found for %s. 
Ending pagination loop.", site_url) - break - time.sleep(5) - posts = http_get_json(site_url, f'/posts.json?before={last_id - 1}').get('latest_posts', []) - while not posts and last_id and last_id >= 0: - last_id -= 49 - posts = http_get_json(site_url, f'/posts.json?before={last_id}').get('latest_posts', []) - time.sleep(1) + log.error("Error rendering topic %s: %s", tid, e) - # Final merge/update of README from all rendered topics. - if rendered_topics_overall: - existing = read_existing_readme(site_target_dir) - for new_topic in rendered_topics_overall: - if new_topic["id"] not in existing: - existing[new_topic["id"]] = new_topic - write_readme(site_target_dir, existing) + if rend_all: + write_readme(sdir, rend_all) else: - log.info("No topics rendered for %s; skipping final README.md generation.", site_url) + log.info("Site %s: No topics rendered; skipping final README.", site) + update_meta(meta_file, meta) + +def fetch_cats(site: str) -> dict: + """Fetch topic categories using the /categories.json endpoint for now.""" + try: + js = fetch_json(site + "/categories.json") + cats = js.get("category_list", {}).get("categories", []) + mapping = {int(c["id"]): c["name"] for c in cats} + log.info("Fetched %d categories from %s", len(mapping), site) + return mapping + except Exception as e: + log.error("Failed fetch categories from %s: %s", site, e) + return {} def main() -> None: - parameters = args() - base_target_dir = parameters.target_dir - if not isinstance(base_target_dir, Path): - base_target_dir = Path(base_target_dir) - base_target_dir.mkdir(parents=True, exist_ok=True) - sites = parse_sites(parameters.urls) + params = args() + base = params.target_dir if isinstance(params.target_dir, Path) else Path(params.target_dir) + base.mkdir(parents=True, exist_ok=True) + sites = parse_sites(params.urls) if not sites: log.error("No valid sites provided. Exiting.") sys.exit(1) - for site_url in sites: - process_site(site_url, base_target_dir) + for s in sites: + process_site(s, base) if __name__ == "__main__": main()
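
A note on the more-than-20-posts fix in render_topic: /t/{topic_id}.json embeds only the first batch of posts in post_stream.posts, while post_stream.stream lists every post ID in order, so the remainder is pulled through /t/{topic_id}/posts.json with repeated post_ids[] parameters. Below is a minimal standalone sketch of that flow, assuming an anonymously readable forum; the host and topic ID are placeholders.

import json
import urllib.request

def fetch_all_posts(site: str, topic_id: int) -> list:
    """Return every post of a topic, not just the first batch embedded in the topic JSON."""
    def get_json(url: str) -> dict:
        with urllib.request.urlopen(url, timeout=15) as resp:
            return json.loads(resp.read().decode())

    topic = get_json(f"{site}/t/{topic_id}.json")
    posts = list(topic["post_stream"]["posts"])   # first batch only
    stream = topic["post_stream"]["stream"]       # every post ID, in display order
    have = {p["id"] for p in posts}
    missing = [pid for pid in stream if pid not in have]

    # Discourse accepts repeated post_ids[] parameters; fetch the rest in chunks of 20.
    for i in range(0, len(missing), 20):
        qs = "&".join(f"post_ids[]={pid}" for pid in missing[i:i + 20])
        extra = get_json(f"{site}/t/{topic_id}/posts.json?{qs}")
        posts.extend(extra["post_stream"]["posts"])

    posts.sort(key=lambda p: p.get("post_number", 0))
    return posts

# Hypothetical usage:
# posts = fetch_all_posts("https://forum.example.org", 1234)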
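
fetch_topic_ids replaces the old /posts.json pagination with a walk over /categories.json and each category's /c/{slug}/{id}.json listing. A trimmed sketch of that traversal under the same assumptions; like the patch, it reads only the first page of each category listing, so later pages would still need to be followed for full coverage.

import json
import urllib.request

def list_topic_ids(site: str) -> set:
    """Collect topic IDs from the first page of every category listing."""
    def get_json(url: str) -> dict:
        with urllib.request.urlopen(url, timeout=15) as resp:
            return json.loads(resp.read().decode())

    topic_ids = set()
    categories = get_json(f"{site}/categories.json")["category_list"]["categories"]
    for cat in categories:
        listing = get_json(f"{site}/c/{cat['slug']}/{cat['id']}.json")
        for topic in listing.get("topic_list", {}).get("topics", []):
            topic_ids.add(topic["id"])
    return topic_ids

# Hypothetical usage:
# ids = list_topic_ids("https://forum.example.org")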
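
fetch_url gives up immediately on a 404 and retries every other failure with a doubling back-off, adding the Api-Key / Api-Username headers when the DISCOURSE_API_KEY and DISCOURSE_API_USERNAME environment variables are set. A compressed sketch of that retry pattern, with RETRY_MAX mirroring the script's constant:

import os
import time
import urllib.error
import urllib.request

RETRY_MAX = 5

def fetch_url(url: str, timeout: int = 15):
    """Return the decoded response body, or None on 404 or after RETRY_MAX failures."""
    req = urllib.request.Request(url)
    api_key = os.environ.get("DISCOURSE_API_KEY", "")
    api_user = os.environ.get("DISCOURSE_API_USERNAME", "")
    if api_key and api_user:
        req.add_header("Api-Key", api_key)
        req.add_header("Api-Username", api_user)
    backoff = 3
    for _ in range(RETRY_MAX):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return resp.read().decode()
        except urllib.error.HTTPError as e:
            if e.code == 404:
                return None          # missing resource: no point retrying
            time.sleep(backoff)
            backoff *= 2
        except Exception:
            time.sleep(backoff)
            backoff *= 2
    return None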
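
Before html2text conversion, proc_html downloads every referenced image and rewrites img src/srcset values (and image-pointing anchor hrefs) to paths relative to the rendered Markdown file. A reduced sketch of just the src rewrite step, with the download left out and rel standing in for the precomputed relative assets path:

import os
from urllib.parse import urlparse

from bs4 import BeautifulSoup  # pip install beautifulsoup4

def rewrite_image_srcs(html: str, rel: str) -> str:
    """Point each img src at the local copy that would live under the assets directory."""
    soup = BeautifulSoup(html, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        if src.startswith("//"):          # scheme-relative URLs become https
            src = "https:" + src
        fname = os.path.basename(urlparse(src).path)
        if fname:                         # this is where the download would happen
            img["src"] = os.path.join(rel, fname).replace(os.sep, "/")
    return str(soup)

print(rewrite_image_srcs('<img src="//cdn.example.org/a/b/pic.png">', "../../assets/images/42"))
# prints something like: <img src="../../assets/images/42/pic.png"/>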
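
The README table of contents depends on one line format per topic, "- [Title](relative/path.md) <!-- id: 123 -->"; TOC_PAT both parses existing entries in read_readme and bounds the insertion point in append_readme. A small round-trip check of that format; the title and path here are made up.

import re

TOC_PAT = re.compile(r"- \[(?P<title>.+?)\]\((?P<rel>.+?)\) <!-- id: (?P<id>\d+) -->")

def toc_line(title: str, rel: str, topic_id: int) -> str:
    return f"- [{title}]({rel}) <!-- id: {topic_id} -->"

line = toc_line("Welcome to the forum", "rendered-topics/2025-04-April/2025-04-03-welcome-id42.md", 42)
match = TOC_PAT.match(line)
assert match is not None
assert int(match.group("id")) == 42 and match.group("title") == "Welcome to the forum"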
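
process_site submits one render_topic call per topic to a ThreadPoolExecutor and updates the metadata file and README as each future completes, so a single failing topic cannot abort the run. A minimal sketch of that submit/as_completed pattern, with render standing in for render_topic:

from concurrent.futures import ThreadPoolExecutor, as_completed

def render(topic_id: int) -> dict:
    # Stand-in for render_topic(); returns the per-topic summary dict.
    return {"id": topic_id, "title": f"Topic {topic_id}"}

topic_ids = [101, 102, 103]
results = {}
with ThreadPoolExecutor(max_workers=10) as pool:
    futures = {pool.submit(render, tid): tid for tid in topic_ids}
    for fut in as_completed(futures):
        tid = futures[fut]
        try:
            results[tid] = fut.result()   # metadata and README updates would happen here
        except Exception as exc:
            print(f"topic {tid} failed: {exc}")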