From 8c89c6309a2a199e99d0937c66f358a7efe1ff5d Mon Sep 17 00:00:00 2001
From: c0mmando <103726157+c0mmando@users.noreply.github.com>
Date: Wed, 5 Mar 2025 00:45:37 +0000
Subject: [PATCH] Create discourse2github.py
---
discourse2github.py | 616 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 616 insertions(+)
create mode 100644 discourse2github.py
diff --git a/discourse2github.py b/discourse2github.py
new file mode 100644
index 0000000..8b37072
--- /dev/null
+++ b/discourse2github.py
@@ -0,0 +1,616 @@
+#!/usr/bin/env python3
+"""
+Archive Discourse posts and render topics to Markdown from multiple sites.
+
+This script downloads posts from one or more Discourse servers via their APIs.
+It archives new posts as JSON files (skipping those already saved or archived),
+renders topics to Markdown files for each batch of posts concurrently (with images
+downloaded and link URLs rewritten as relative paths), updates a metadata file
+after each post is indexed, and then updates a README.md with a table of contents
+linking to each archived topic.
+
+Usage:
+ ./discourse2github.py --urls https://forum.hackliberty.org,https://forum.qubes-os.org --target-dir ./archive
+"""
+
+import argparse
+import concurrent.futures
+import functools
+import json
+import logging
+import os
+import re
+import sys
+import time
+import urllib.request
+import datetime
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.parse import urlparse
+
+import html2text # pip install html2text
+from bs4 import BeautifulSoup # pip install beautifulsoup4
+
+# Set up logging. If the 'rich' module is available, it will be used.
+loglevel = 'DEBUG' if os.environ.get('DEBUG') else 'INFO'
+try:
+ from rich.logging import RichHandler
+ logging.basicConfig(level=loglevel, datefmt="[%X]", handlers=[RichHandler()])
+except ImportError:
+ logging.basicConfig(level=loglevel)
+log = logging.getLogger('archive')
+
+# Argument parser
+parser = argparse.ArgumentParser(
+ description='Archive topics from one or more Discourse installations and render to Markdown')
+parser.add_argument(
+ '--urls',
+ help='Comma-separated URLs of Discourse servers (for example: "https://forum.hackliberty.org,https://forum.qubes-os.org")',
+ default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org'))
+parser.add_argument(
+ '--debug', action='store_true', default=os.environ.get('DEBUG', False))
+parser.add_argument(
+ '-t', '--target-dir', help='Target base directory for the archives',
+ default=Path(os.environ.get('TARGET_DIR', './archive')))
+
+@functools.cache
+def args():
+ return parser.parse_args()
+
+def parse_sites(urls_string: str) -> list:
+ """Return a list of cleaned-up site URLs."""
+ return [url.strip().rstrip('/') for url in urls_string.split(',') if url.strip()]
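+
+# Example: parse_sites("https://forum.hackliberty.org, https://forum.qubes-os.org/")
+# returns ["https://forum.hackliberty.org", "https://forum.qubes-os.org"].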
+
+def http_get(site_url: str, path: str, timeout: int = 15) -> str:
+ """Simple HTTP GET with exponential backoff and a timeout."""
+ full_url = f"{site_url}{path}"
+ log.debug("HTTP GET %s", full_url)
+ backoff = 3
+ while True:
+ try:
+ with urllib.request.urlopen(full_url, timeout=timeout) as response:
+ return response.read().decode()
+ except Exception as e:
+ log.debug("Error fetching %s: %s -- Retrying in %d seconds", full_url, e, backoff)
+ time.sleep(backoff)
+ backoff *= 2
+ if backoff >= 256:
+ log.exception("Rate limit or unrecoverable error for %s", full_url)
+ sys.exit(1)
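+
+# Retry behaviour as coded above: the delay starts at 3 s and doubles after each
+# failed attempt (3, 6, 12, 24, 48, 96, 192), and the loop logs the error and
+# exits once the doubled delay reaches 256 s.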
+
+def http_get_json(site_url: str, path: str, timeout: int = 15) -> dict:
+ """Fetch URL contents from a specific site and decode JSON."""
+ try:
+ return json.loads(http_get(site_url, path, timeout=timeout))
+ except json.JSONDecodeError:
+ log.warning("Unable to decode JSON response from %r", path)
+ raise
+
+# ----- Helper: Truncate Filename -----
+def truncate_filename(filename: str, max_length: int = 255) -> str:
+ """
+ Truncates the file name to a maximum length (default 255 characters).
+ It preserves the file extension.
+ """
+ if len(filename) <= max_length:
+ return filename
+ p = Path(filename)
+ stem = p.stem
+ suffix = "".join(p.suffixes)
+ max_stem_length = max_length - len(suffix)
+ if max_stem_length <= 0:
+ return filename[:max_length]
+ truncated_stem = stem[:max_stem_length]
+ return truncated_stem + suffix
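+
+# Illustrative example (hypothetical name): truncate_filename("x" * 300 + ".json",
+# max_length=32) trims the stem but keeps the ".json" suffix, yielding a
+# 32-character filename.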
+
+# ----- Image / Link Download Helpers -----
+def fix_url(original_url: str) -> str:
+ """Fix scheme-relative URLs by prepending https: if necessary."""
+ if original_url.startswith("//"):
+ fixed = "https:" + original_url
+ log.debug("Converted scheme-relative URL: %s -> %s", original_url, fixed)
+ return fixed
+ return original_url
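+
+# Example (illustrative path): fix_url("//forum.hackliberty.org/uploads/a.png")
+# returns "https://forum.hackliberty.org/uploads/a.png"; fully qualified URLs are
+# returned unchanged.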
+
+def download_image(image_url: str, dest_path: Path, timeout: int = 15):
+ """
+ Download an image from image_url and save it to dest_path.
+ If the file already exists, skip downloading.
+ A timeout is specified to avoid hanging indefinitely.
+ """
+ if dest_path.exists():
+ log.debug("Image already downloaded: %s", dest_path)
+ return
+ try:
+ log.info("Downloading image: %s", image_url)
+ with urllib.request.urlopen(fix_url(image_url), timeout=timeout) as response:
+ image_data = response.read()
+ dest_path.parent.mkdir(parents=True, exist_ok=True)
+ dest_path.write_bytes(image_data)
+ log.info("Saved image to %s", dest_path)
+ except Exception as e:
+ log.error("Failed to download image %s: %s", image_url, e)
+
+def process_srcset(srcset_value: str, topic_dir: Path, topic_relative_path: str) -> str:
+ """
+ Process a srcset attribute value, downloading images and returning a rewritten value.
+ Downloads every image referenced regardless of URL content.
+ """
+ entries = srcset_value.split(",")
+ fixed_entries = []
+ for entry in entries:
+ parts = entry.strip().split()
+ if not parts:
+ continue
+ orig_url = parts[0]
+ fixed_url = fix_url(orig_url)
+ parsed = urlparse(fixed_url)
+ image_filename = os.path.basename(parsed.path)
+ if not image_filename:
+ log.warning("Skipping srcset URL with empty filename: %s", fixed_url)
+ continue
+ dest_path = topic_dir / image_filename
+ download_image(fixed_url, dest_path)
+ full_path = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/')
+ if len(parts) > 1:
+ fixed_entries.append(f"{full_path} {parts[1]}")
+ else:
+ fixed_entries.append(f"{full_path}")
+ return ", ".join(fixed_entries)
+
+def is_image_link(url: str) -> bool:
+ """Determine if the URL points to an image by its extension."""
+ image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".webp")
+ parsed = urlparse(url)
+ filename = os.path.basename(parsed.path).lower()
+ return filename.endswith(image_extensions)
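+
+# Example: is_image_link("https://forum.hackliberty.org/uploads/pic.JPG?v=1") is
+# True (the extension check is case-insensitive and ignores the query string);
+# ".svg" or extension-less URLs return False.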
+
+def process_html(html_content: str, topic_dir: Path, topic_relative_path: str) -> str:
+ """
+ Process the given HTML: download referenced images and rewrite links.
+    Processes both <img> tags (src, srcset) and <a> tags pointing to images.
+ Downloads every image referenced in the HTML.
+ Returns the modified HTML.
+ """
+ soup = BeautifulSoup(html_content, "html.parser")
+
+    # Process <img> tags.
+ for img in soup.find_all("img"):
+ src = img.get("src")
+ if src:
+ src = fix_url(src)
+ parsed = urlparse(src)
+ image_filename = os.path.basename(parsed.path)
+ if image_filename:
+ dest_path = topic_dir / image_filename
+ download_image(src, dest_path)
+ full_src = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/')
+ img["src"] = full_src
+ else:
+ log.warning("Skipping image with empty filename from src: %s", src)
+ srcset = img.get("srcset")
+ if srcset:
+ new_srcset = process_srcset(srcset, topic_dir, topic_relative_path)
+ img["srcset"] = new_srcset
+
+    # Process <a> tags whose href points to images.
+ for a in soup.find_all("a"):
+ href = a.get("href")
+ if href:
+ fixed_href = fix_url(href)
+ if is_image_link(fixed_href):
+ parsed = urlparse(fixed_href)
+ image_filename = os.path.basename(parsed.path)
+ if image_filename:
+ dest_path = topic_dir / image_filename
+ download_image(fixed_href, dest_path)
+ new_href = os.path.join(topic_relative_path, image_filename).replace(os.sep, '/')
+ a["href"] = new_href
+ else:
+ log.warning("Skipping link with empty filename from href: %s", fixed_href)
+ return str(soup)
+
+def slugify(value: str) -> str:
+ """
+    Normalizes the string, removes non-alphanumeric characters, and converts whitespace to hyphens.
+ Useful for constructing filenames.
+ """
+ value = str(value)
+ value = value.strip().lower()
+ value = re.sub(r'[^a-z0-9\s-]', '', value)
+ value = re.sub(r'[\s-]+', '-', value)
+ return value or "untitled"
+
+# ----- Data Models -----
+@dataclass(frozen=True)
+class PostTopic:
+ id: int
+ slug: str
+ title: str
+
+@dataclass(frozen=True)
+class Post:
+ id: int
+ slug: str
+ raw: dict
+
+ def get_created_at(self) -> datetime.datetime:
+ return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))
+
+ def save(self, dir: Path):
+ """Save the raw JSON post to disk if not already archived."""
+ idstr = str(self.id).zfill(10)
+ filename = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json"
+ filename = truncate_filename(filename)
+ folder_name = self.get_created_at().strftime('%Y-%m-%B')
+ full_path = dir / folder_name / filename
+
+ if full_path.exists():
+ log.debug("Post %s already saved, skipping", self.id)
+ return
+
+ full_path.parent.mkdir(parents=True, exist_ok=True)
+ log.info("Saving post %s to %s", self.id, full_path)
+ full_path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8')
+
+ def get_topic(self) -> PostTopic:
+ return PostTopic(
+ id=self.raw.get('topic_id', self.id),
+ slug=self.raw.get('topic_slug', self.slug),
+ title=self.raw.get('topic_title', self.raw.get('title', 'No Title')),
+ )
+
+ @classmethod
+ def from_json(cls, j: dict) -> 'Post':
+ return cls(
+ id=j['id'],
+ slug=j.get('topic_slug', 'unknown'),
+ raw=j,
+ )
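+
+# Minimal sketch of a post object from /posts.json as consumed by Post above
+# (field names taken from the code; values are illustrative):
+#   {"id": 42, "username": "alice", "topic_id": 7, "topic_slug": "welcome",
+#    "topic_title": "Welcome to the forum",
+#    "created_at": "2025-03-05T00:45:37.000Z"}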
+
+@dataclass(frozen=True)
+class Topic:
+ id: int
+ slug: str
+ raw: dict
+ markdown: str
+
+ def get_created_at(self) -> datetime.datetime:
+ return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))
+
+ def save_rendered(self, dir: Path):
+ """
+ Save the rendered Markdown topic to disk.
+ Filename built from creation date, slug, and id.
+ Truncate the filename if needed.
+ """
+ date_str = str(self.get_created_at().date())
+ filename = f"{date_str}-{self.slug}-id{self.id}.md"
+ filename = truncate_filename(filename)
+ folder_name = self.get_created_at().strftime('%Y-%m-%B')
+ full_path = dir / folder_name / filename
+ full_path.parent.mkdir(parents=True, exist_ok=True)
+ log.info("Saving rendered topic %s to %s", self.id, full_path)
+ rendered_markdown = f"# {self.raw.get('title', 'No Title')}\n\n{self.markdown}"
+ full_path.write_text(rendered_markdown, encoding='utf-8')
+ # Return the relative path from the repository root.
+ return full_path.relative_to(dir.parent)
+
+ @classmethod
+ def from_json(cls, t: dict, markdown: str) -> 'Topic':
+ slug = t.get('slug') or t.get('topic_slug') or "unknown"
+ return cls(
+ id=t.get('id', 0),
+ slug=slug,
+ raw=t,
+ markdown=markdown,
+ )
+
+# ----- New Helper for Rendering Topics with Image Downloading -----
+def render_topic(site_url: str, topic: PostTopic, topics_dir: Path):
+ """
+ Render a single topic to Markdown by:
+ 1. Fetching the topic JSON.
+ 2. Downloading its associated images and rewriting their URLs.
+ 3. Converting processed HTML to Markdown (using html2text).
+ 4. Saving the rendered Markdown document.
+
+ Images are saved to an assets directory relative to the site target directory.
+ Returns a dictionary with topic info for README updating.
+ """
+ try:
+ log.info("Fetching topic %s JSON from %s", topic.id, site_url)
+ topic_data = http_get_json(site_url, f"/t/{topic.id}.json")
+ except Exception as e:
+ log.warning("Failed to fetch topic JSON for topic %s: %s", topic.id, e)
+ return None
+
+ # Define the assets directory in the repository root.
+ assets_dir = topics_dir.parent / "assets" / "images" / f"{topic.id}"
+ assets_dir.mkdir(parents=True, exist_ok=True)
+
+ # Determine the directory where the rendered markdown file will be saved.
+ try:
+ created_at = datetime.datetime.fromisoformat(topic_data['created_at'].replace("Z", "+00:00"))
+ except Exception as e:
+ log.error("Could not parse created_at for topic %s: %s", topic.id, e)
+ created_at = datetime.datetime.now()
+ folder_name = created_at.strftime('%Y-%m-%B')
+ rendered_md_dir = topics_dir / folder_name
+
+ # Compute the relative path from the markdown file's directory to the assets directory.
+ topic_relative_path = os.path.relpath(assets_dir, rendered_md_dir)
+
+ posts = topic_data.get("post_stream", {}).get("posts", [])
+ if not posts:
+ log.error("No posts found for topic %s", topic.id)
+ return None
+
+ converter = html2text.HTML2Text()
+ converter.body_width = 0
+ md_sections = []
+ for post in posts:
+ created = post.get("created_at", "unknown")
+ updated = post.get("updated_at", "unknown")
+ post_number = post.get("post_number", 0)
+ cooked_html = post.get("cooked", "")
+ # Pass the corrected topic_relative_path into process_html()
+ processed_html = process_html(cooked_html, assets_dir, topic_relative_path)
+ post_md = converter.handle(processed_html)
+ header_lines = [
+ f"**ID:** {topic.id}",
+ f"**USERNAME:** {post.get('username', 'unknown')}",
+ f"**POST NUMBER:** {post_number}",
+ f"**CREATED AT:** {created}",
+ f"**UPDATED AT:** {updated}",
+ ]
+ # Join header lines with two newlines so each appears on its own line in GitHub Markdown.
+ header = "\n\n".join(header_lines)
+ section = f"## Post {post_number}\n\n{header}\n\n---\n\n{post_md}"
+ md_sections.append(section)
+ full_md = "\n\n".join(md_sections)
+ topic_title = topic_data.get("title", "No Title")
+ full_md = f"# {topic_title}\n\n" + full_md
+
+ topic_obj = Topic.from_json(topic_data, full_md)
+ saved_relative_path = topic_obj.save_rendered(topics_dir)
+ log.info("Saved rendered topic %s (%s)", topic_obj.id, topic_obj.slug)
+ # Return topic info for README.
+ return {
+ "id": topic_obj.id,
+ "slug": topic_obj.slug,
+ "title": topic_title,
+ "relative_path": str(saved_relative_path)
+ }
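+
+# The dict returned by render_topic() feeds the README helpers below, e.g.
+# (illustrative values):
+#   {"id": 7, "slug": "welcome", "title": "Welcome to the forum",
+#    "relative_path": "rendered-topics/2025-03-March/2025-03-05-welcome-id7.md"}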
+
+# ----- Concurrent Rendering Helper -----
+def render_topics_concurrently(site_url: str, topics: dict, topics_dir: Path, max_workers: int = 8):
+ """
+ Render multiple topics concurrently.
+ Returns a list of rendered topic information dictionaries.
+ """
+ rendered_topics_info = []
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+ futures = {executor.submit(render_topic, site_url, topic, topics_dir): topic for topic in topics.values()}
+ for future in concurrent.futures.as_completed(futures):
+ try:
+ result = future.result()
+ if result:
+ rendered_topics_info.append(result)
+ # Update the README incrementally after each topic is rendered.
+ update_readme_incrementally(topics_dir.parent, result)
+ except Exception as exc:
+ log.error("A topic generated an exception: %s", exc)
+ return rendered_topics_info
+
+def update_metadata(metadata_file: Path, metadata: dict):
+ """Writes the metadata as a JSON file to disk."""
+ log.debug("Updating metadata: %s", metadata)
+ metadata_file.write_text(json.dumps(metadata, indent=2), encoding='utf-8')
+
+# A helper pattern to match a TOC line (i.e. a line with the topic entry and its id)
+TOC_LINE_PATTERN = re.compile(
+    r"- \[(?P<title>.+?)\]\((?P<relative_path>.+?)\)\s*<!--\s*id:\s*(?P<id>\d+)\s*-->\s*")
+
+# ----- README Update Helpers -----
+def read_existing_readme(repo_root: Path):
+ """
+ Read the existing README.md from repo_root and return a dictionary of topics.
+ The keys will be the topic IDs (as integers) and the values as the topic dict.
+ If the file doesn't exist, return an empty dict.
+ """
+ readme_path = repo_root / "README.md"
+ existing_topics = {}
+ if readme_path.exists():
+ try:
+ content = readme_path.read_text(encoding='utf-8')
+            # Expecting lines like: - [Topic Title](relative_path) <!-- id: 123 -->
+ pattern = TOC_LINE_PATTERN
+ for line in content.splitlines():
+ match = pattern.match(line)
+ if match:
+ topic_id = int(match.group("id"))
+ existing_topics[topic_id] = {
+ "id": topic_id,
+ "title": match.group("title"),
+ "relative_path": match.group("relative_path")
+ }
+ except Exception as e:
+ log.error("Failed to parse existing README.md: %s", e)
+ return existing_topics
+
+def update_readme_incrementally(repo_root: Path, new_topic: dict):
+ """
+ Update or create README.md in repo_root by merging the new topic into the existing list.
+    If the topic already exists, skip it; otherwise, append the new topic to the TOC.
+ """
+ topic_id = new_topic["id"]
+ existing_topics = read_existing_readme(repo_root)
+ if topic_id in existing_topics:
+ log.debug("Topic with id %s already exists in README.md", topic_id)
+ return
+ existing_topics[topic_id] = new_topic
+ append_to_readme(repo_root, new_topic)
+
+def append_to_readme(repo_root: Path, new_topic: dict):
+ """
+ Append a new topic to the existing README.md table-of-contents (TOC).
+ If README.md doesn't exist, create it with a header and the new topic.
+ """
+ readme_path = repo_root / "README.md"
+ toc_header = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
+ new_toc_line = f"- [{new_topic['title']}]({new_topic['relative_path']}) "
+
+ if readme_path.exists():
+ try:
+ # Read the existing content
+ content = readme_path.read_text(encoding="utf-8")
+ lines = content.splitlines()
+ # Check if the file already has a TOC header by looking for the header marker.
+ try:
+ toc_start = lines.index("## Table of Contents")
+                # Start just after the "## Table of Contents" header line.
+ insertion_index = toc_start + 1
+ # Advance until we find the first non-TOC line or reach the end.
+ while (
+ insertion_index < len(lines)
+ and TOC_LINE_PATTERN.match(lines[insertion_index].strip())
+ ):
+ insertion_index += 1
+ # Now, insert our new entry just before the first non-TOC line.
+ lines.insert(insertion_index, new_toc_line)
+ new_content = "\n".join(lines)
+ except ValueError:
+ # "## Table of Contents" not found, so we create a new TOC block at the top
+ new_content = "\n".join(toc_header + [new_toc_line] + [""] + lines)
+ except Exception as e:
+ log.error("Failed to read existing README.md: %s", e)
+ # In case of error, default to creating a new README.md with header and new topic
+ new_content = "\n".join(toc_header + [new_toc_line])
+ else:
+ # README.md doesn't exist, create a new one with a standard header and the new TOC entry
+ new_content = "\n".join(toc_header + [new_toc_line])
+
+ try:
+ readme_path.write_text(new_content, encoding="utf-8")
+ log.info("Updated README.md at %s", readme_path)
+ except Exception as e:
+ log.error("Failed to write README.md: %s", e)
+
+def write_readme(site_target_dir: Path, topics: dict):
+ """
+ Given a dictionary of topics, write out the full README.md at the site target directory.
+ """
+ readme_path = site_target_dir / "README.md"
+ lines = ["# Archived Discourse Topics", "", "## Table of Contents", ""]
+ sorted_topics = sorted(topics.values(), key=lambda t: t["id"])
+ for topic in sorted_topics:
+ line = f"- [{topic['title']}]({topic['relative_path']}) "
+ lines.append(line)
+ content = "\n".join(lines)
+ try:
+ readme_path.write_text(content, encoding="utf-8")
+ log.info("Finalized README.md updated at %s", readme_path)
+ except Exception as e:
+ log.error("Failed to write final README.md: %s", e)
+
+# ----- Site Processing Function -----
+def process_site(site_url: str, base_target_dir: Path):
+ """
+ Archive posts and render topics for a single site.
+ Each site gets its own subdirectory (named for its hostname) inside the base target directory,
+ and its own metadata file.
+
+ The README.md is updated incrementally after each topic is rendered.
+ """
+ parsed = urlparse(site_url)
+ site_name = parsed.hostname or site_url.replace("https://", "").replace("http://", "").split('/')[0]
+ log.info("Processing site: %s", site_url)
+ site_target_dir = base_target_dir / site_name
+ posts_dir = site_target_dir / 'posts'
+ topics_dir = site_target_dir / 'rendered-topics'
+ posts_dir.mkdir(parents=True, exist_ok=True)
+ topics_dir.mkdir(parents=True, exist_ok=True)
+ metadata_file = site_target_dir / '.metadata.json'
+
+ # Load stored metadata if exists.
+ metadata = {}
+ archived_post_ids = set()
+ if metadata_file.exists():
+ try:
+ metadata = json.loads(metadata_file.read_text())
+ if "archived_post_ids" in metadata:
+ archived_post_ids = set(int(x) for x in metadata.get('archived_post_ids', []))
+ except Exception as e:
+ log.error("Failed to read/parse metadata file for %s: %s", site_url, e)
+
+ posts_json = http_get_json(site_url, '/posts.json')
+ posts = posts_json.get('latest_posts', [])
+ last_id = None
+ should_stop = False
+
+ # List to accumulate info for final README generation.
+ rendered_topics_overall = []
+
+ while posts:
+ log.info("Processing %d posts for %s", len(posts), site_url)
+ topics_to_render = {} # Unique topics in this batch.
+ for json_post in posts:
+ try:
+ post = Post.from_json(json_post)
+ except Exception as e:
+ log.warning("Failed to deserialize post %s: %s", json_post, e)
+ continue
+ if post.id in archived_post_ids:
+ log.debug("Post %s already archived, skipping", post.id)
+ continue
+ post.save(posts_dir)
+ archived_post_ids.add(post.id)
+ last_id = post.id
+ topic = post.get_topic()
+ topics_to_render[topic.id] = topic
+ # Update metadata right away so that already processed posts won't be lost on interrupt.
+ metadata['archived_post_ids'] = sorted(archived_post_ids)
+ update_metadata(metadata_file, metadata)
+ if topics_to_render:
+ log.info("Rendering %d topics concurrently for %s.", len(topics_to_render), site_url)
+ rendered = render_topics_concurrently(site_url, topics_to_render, topics_dir, max_workers=8)
+ rendered_topics_overall.extend(rendered)
+ if should_stop:
+ log.info("Stopping pagination loop based on sync date for %s.", site_url)
+ break
+ if last_id is None or last_id <= 1:
+ log.info("No valid last_id found for %s. Ending pagination loop.", site_url)
+ break
+ time.sleep(5)
+ posts = http_get_json(site_url, f'/posts.json?before={last_id - 1}').get('latest_posts', [])
+ while not posts and last_id and last_id >= 0:
+ last_id -= 49
+ posts = http_get_json(site_url, f'/posts.json?before={last_id}').get('latest_posts', [])
+ time.sleep(1)
+
+ # Final merge/update of README from all rendered topics.
+ if rendered_topics_overall:
+ existing = read_existing_readme(site_target_dir)
+ for new_topic in rendered_topics_overall:
+ if new_topic["id"] not in existing:
+ existing[new_topic["id"]] = new_topic
+ write_readme(site_target_dir, existing)
+ else:
+ log.info("No topics rendered for %s; skipping final README.md generation.", site_url)
+
+def main() -> None:
+ parameters = args()
+ base_target_dir = parameters.target_dir
+ if not isinstance(base_target_dir, Path):
+ base_target_dir = Path(base_target_dir)
+ base_target_dir.mkdir(parents=True, exist_ok=True)
+ sites = parse_sites(parameters.urls)
+ if not sites:
+ log.error("No valid sites provided. Exiting.")
+ sys.exit(1)
+ for site_url in sites:
+ process_site(site_url, base_target_dir)
+
+if __name__ == "__main__":
+ main()