From 3a6f3dd3eb869da371df8c77414086f7b3083950 Mon Sep 17 00:00:00 2001 From: git-bruh Date: Mon, 13 Feb 2023 14:18:34 +0530 Subject: [PATCH 1/6] squash --- job_thread/main.py | 133 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 job_thread/main.py diff --git a/job_thread/main.py b/job_thread/main.py new file mode 100644 index 0000000..23eb34f --- /dev/null +++ b/job_thread/main.py @@ -0,0 +1,133 @@ +import logging +import sqlite3 +from os import environ +from time import time +import praw + +SECONDS_IN_WEEK = 60 * 60 * 24 * 7 + + +class Config: + DB_PATH = "posts.db" + SUBREDDIT = "developersindia" + + POST_FLAIR = "Hiring" + POST_TITLE = "Don't Miss Out on These Job Opportunities | Weekly Job Openings Thread" + POST_TEXT = """\ +This thread has all the latest job openings that haven't been posted on previous weekly threads. + +If you have a job opening that you'd like to share with the community, you can post it using this link:- https://developersindia.in/post-a-job/ + +For all the available job openings, check out the Job Board at:- https://developersindia.in/job-board/ + +Stay tuned for updates on the latest job openings, and apply for the ones that interest you. Wishing you the best of luck in your job search!\ +""" + + CLIENT_ID = environ["REDDIT_CLIENT_ID"] + CLIENT_SECRET = environ["REDDIT_CLIENT_SECRET"] + REDDIT_PASSWORD = environ["REDDIT_PASSWORD"] + USERNAME = environ["REDDIT_USERNAME"] + USER_AGENT = f"u/{USERNAME} Job Board" + + +class Database: + def __init__(self, db_path): + self.conn = sqlite3.connect(db_path) + self.conn.row_factory = sqlite3.Row + + self.cur = self.conn.cursor() + + self._create() + + def _create(self): + with self.conn: + self.cur.execute( + "CREATE TABLE IF NOT EXISTS Posts" + "(post_id TEXT PRIMARY KEY, time INTEGER NOT NULL DEFAULT(UNIXEPOCH()))" + ) + + def get_latest_post(self): + self.cur.execute("SELECT post_id, time from Posts ORDER BY time DESC") + + if (result := self.cur.fetchone()) is not None: + return dict(result) + + def insert_post(self, post_id: str, timestamp: int): + with self.conn: + self.cur.execute( + "INSERT INTO Posts (post_id, time) VALUES ((?), (?))", + (post_id, timestamp), + ) + + +def should_create_new_post(latest_post): + if latest_post is not None: + return (time() - latest_post["time"]) >= SECONDS_IN_WEEK + + return True + + +def create_job_post(subreddit): + # https://old.reddit.com/r/redditdev/comments/ovte4q/praw_flair_a_post/h7doqmd/?context=3 + flair = next( + filter( + lambda flair: flair["flair_text"] == Config.POST_FLAIR, + subreddit.flair.link_templates.user_selectable(), + ) + ) + + submission = subreddit.submit( + Config.POST_TITLE, + selftext=Config.POST_TEXT, + flair_id=flair["flair_template_id"], + ) + submission.mod.sticky() + + return submission + + +def main(): + logging.root.setLevel(logging.INFO) + + db = Database(Config.DB_PATH) + reddit = praw.Reddit( + client_id=Config.CLIENT_ID, + client_secret=Config.CLIENT_SECRET, + password=Config.REDDIT_PASSWORD, + user_agent=Config.USER_AGENT, + username=Config.USERNAME, + ) + + subreddit = reddit.subreddit(Config.SUBREDDIT) + + maybe_old_post = db.get_latest_post() + + logging.info(f"Latest post in database {maybe_old_post}") + + if should_create_new_post(maybe_old_post): + # Un-stick/pin the old post + if maybe_old_post is not None: + logging.info(f"Un-pinning old post") + + try: + reddit.submission(maybe_old_post["post_id"]).mod.sticky( + state=False + ) + except Exception: + logging.warning(f"Failed to un-pin post!", exc_info=True) + + new_submission = create_job_post(subreddit) + + logging.info( + f"Created new post {new_submission.id} at {new_submission.created_utc}" + ) + + db.insert_post(new_submission.id, new_submission.created_utc) + + submission = reddit.submission(db.get_latest_post()["post_id"]) + + logging.info(f"Fetched latest submission {submission.id}") + + +if __name__ == "__main__": + main() From cb2fee093c3a3bdc6c8dc76565516cbebf315ca1 Mon Sep 17 00:00:00 2001 From: git-bruh Date: Tue, 14 Feb 2023 20:56:16 +0530 Subject: [PATCH 2/6] feed --- job_thread/main.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/job_thread/main.py b/job_thread/main.py index 23eb34f..e5bc6ee 100644 --- a/job_thread/main.py +++ b/job_thread/main.py @@ -1,7 +1,10 @@ import logging import sqlite3 +from dataclasses import dataclass from os import environ from time import time + +import feedparser import praw SECONDS_IN_WEEK = 60 * 60 * 24 * 7 @@ -28,6 +31,7 @@ Stay tuned for updates on the latest job openings, and apply for the ones that i REDDIT_PASSWORD = environ["REDDIT_PASSWORD"] USERNAME = environ["REDDIT_USERNAME"] USER_AGENT = f"u/{USERNAME} Job Board" + FEED_URL = "https://developersindia.in/?feed=job_feed" class Database: @@ -60,6 +64,36 @@ class Database: ) +@dataclass +class Job: + post_id: str # Used for deduplication + title: str + company_name: str + location: str + job_type: str + salary: str + summary: str + permalink: str + + +def get_job_entries(feed_url): + entries = feedparser.parse(feed_url).entries + + return [ + Job( + post_id=entry["post-id"], + title=entry["title"], + company_name=entry["job_listing_company"], + location=entry.get("job_listing_location", "N/A"), + job_type=entry["job_listing_job_type"], + salary=entry.get("job_listing_salary", "N/A"), + summary=entry["summary"], + permalink=entry["link"], + ) + for entry in entries + ] + + def should_create_new_post(latest_post): if latest_post is not None: return (time() - latest_post["time"]) >= SECONDS_IN_WEEK From 162c40dadbad56456aad0371e64da3da422b7654 Mon Sep 17 00:00:00 2001 From: git-bruh Date: Wed, 15 Feb 2023 14:34:56 +0530 Subject: [PATCH 3/6] json --- job_thread/main.py | 114 +++++++++++++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 35 deletions(-) diff --git a/job_thread/main.py b/job_thread/main.py index e5bc6ee..2e3ccf4 100644 --- a/job_thread/main.py +++ b/job_thread/main.py @@ -1,7 +1,8 @@ +import json import logging -import sqlite3 +from copy import deepcopy from dataclasses import dataclass -from os import environ +from os import environ, fsync from time import time import feedparser @@ -11,7 +12,7 @@ SECONDS_IN_WEEK = 60 * 60 * 24 * 7 class Config: - DB_PATH = "posts.db" + DB_PATH = "db.json" SUBREDDIT = "developersindia" POST_FLAIR = "Hiring" @@ -34,34 +35,79 @@ Stay tuned for updates on the latest job openings, and apply for the ones that i FEED_URL = "https://developersindia.in/?feed=job_feed" -class Database: - def __init__(self, db_path): - self.conn = sqlite3.connect(db_path) - self.conn.row_factory = sqlite3.Row +@dataclass +class Post: + post_id: str + epoch: int - self.cur = self.conn.cursor() + +def dict_raise_or_set(d, key, value): + if d.get(key) is not None: + raise ValueError(f"Key {key} already present in dictionary") + + d[key] = value + + +class Database: + POSTS = "postid_epoch" + COMMENTS = "jobid_commentid" + + def __init__(self, db_path): + try: + self._fp = open(db_path, "r+") + self._db = json.loads(self._fp.read() or "{}") + except FileNotFoundError: + self._fp = open(db_path, "w") + self._db = {} + + self._copy = None self._create() + def __enter__(self): + self._copy = deepcopy(self._db) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # We re-write the DB on each transaction as we cannot guarantee "atomicity" + # when dealing with external APIs. Eg, we can locally roll back a transaction + # inserting multiple posts into the DB in case of an exception, but the + # posts obviously won't be deleted on the reddit side. So we write + # as much as possible incrementally to prevent losing track of already created + # posts, preventing their re-creation server side in case of a crash. + if exc_type: + self._db = self._copy + # If a change was actually made + elif self._db != self._copy: + self._fp.seek(0) + self._fp.write(json.dumps(self._db, indent=4)) + self._fp.truncate() # Trim the file to the desired size + fsync(self._fp) + + self._copy = None + def _create(self): - with self.conn: - self.cur.execute( - "CREATE TABLE IF NOT EXISTS Posts" - "(post_id TEXT PRIMARY KEY, time INTEGER NOT NULL DEFAULT(UNIXEPOCH()))" - ) + with self: + self._db.setdefault(self.POSTS, {}) + self._db.setdefault(self.COMMENTS, {}) - def get_latest_post(self): - self.cur.execute("SELECT post_id, time from Posts ORDER BY time DESC") + def get_latest_post(self) -> Post | None: + # {"id": 1234, "id2": "5678"} -> ("id2", "5678") (Descending) + try: + result = sorted( + self._db[self.POSTS].items(), + key=lambda item: item[1], + reverse=True, + )[0] + except IndexError: + return None - if (result := self.cur.fetchone()) is not None: - return dict(result) + return Post(post_id=result[0], epoch=result[1]) - def insert_post(self, post_id: str, timestamp: int): - with self.conn: - self.cur.execute( - "INSERT INTO Posts (post_id, time) VALUES ((?), (?))", - (post_id, timestamp), - ) + def insert_post(self, post: Post): + with self: + dict_raise_or_set(self._db[self.POSTS], post.post_id, post.epoch) @dataclass @@ -94,14 +140,14 @@ def get_job_entries(feed_url): ] -def should_create_new_post(latest_post): +def should_create_new_post(latest_post: Post) -> bool: if latest_post is not None: - return (time() - latest_post["time"]) >= SECONDS_IN_WEEK + return (time() - latest_post.epoch) >= SECONDS_IN_WEEK return True -def create_job_post(subreddit): +def create_job_post(subreddit) -> Post: # https://old.reddit.com/r/redditdev/comments/ovte4q/praw_flair_a_post/h7doqmd/?context=3 flair = next( filter( @@ -117,7 +163,7 @@ def create_job_post(subreddit): ) submission.mod.sticky() - return submission + return Post(post_id=submission.id, epoch=submission.created_utc) def main(): @@ -141,24 +187,22 @@ def main(): if should_create_new_post(maybe_old_post): # Un-stick/pin the old post if maybe_old_post is not None: - logging.info(f"Un-pinning old post") + logging.info(f"Un-pinning old post {maybe_old_post}") try: - reddit.submission(maybe_old_post["post_id"]).mod.sticky( + reddit.submission(maybe_old_post.post_id).mod.sticky( state=False ) except Exception: logging.warning(f"Failed to un-pin post!", exc_info=True) - new_submission = create_job_post(subreddit) + new_post = create_job_post(subreddit) - logging.info( - f"Created new post {new_submission.id} at {new_submission.created_utc}" - ) + logging.info(f"Created new post {new_post}") - db.insert_post(new_submission.id, new_submission.created_utc) + db.insert_post(new_post) - submission = reddit.submission(db.get_latest_post()["post_id"]) + submission = reddit.submission(db.get_latest_post().post_id) logging.info(f"Fetched latest submission {submission.id}") From 4d029e7dfb30d82d14ce57a8a827592c0e26514f Mon Sep 17 00:00:00 2001 From: git-bruh Date: Wed, 15 Feb 2023 15:08:33 +0530 Subject: [PATCH 4/6] rss --- job_thread/main.py | 47 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/job_thread/main.py b/job_thread/main.py index 2e3ccf4..5440139 100644 --- a/job_thread/main.py +++ b/job_thread/main.py @@ -48,7 +48,7 @@ def dict_raise_or_set(d, key, value): d[key] = value -class Database: +class DB: POSTS = "postid_epoch" COMMENTS = "jobid_commentid" @@ -89,14 +89,14 @@ class Database: def _create(self): with self: - self._db.setdefault(self.POSTS, {}) - self._db.setdefault(self.COMMENTS, {}) + self._db.setdefault(DB.POSTS, {}) + self._db.setdefault(DB.COMMENTS, {}) def get_latest_post(self) -> Post | None: # {"id": 1234, "id2": "5678"} -> ("id2", "5678") (Descending) try: result = sorted( - self._db[self.POSTS].items(), + self._db[DB.POSTS].items(), key=lambda item: item[1], reverse=True, )[0] @@ -107,7 +107,14 @@ class Database: def insert_post(self, post: Post): with self: - dict_raise_or_set(self._db[self.POSTS], post.post_id, post.epoch) + dict_raise_or_set(self._db[DB.POSTS], post.post_id, post.epoch) + + def insert_comment(self, feed_job_id: str, comment_id: str): + with self: + dict_raise_or_set(self._db[DB.COMMENTS], feed_job_id, comment_id) + + def is_job_posted(self, feed_job_id: str): + return self._db[DB.COMMENTS].get(feed_job_id) is not None @dataclass @@ -169,7 +176,7 @@ def create_job_post(subreddit) -> Post: def main(): logging.root.setLevel(logging.INFO) - db = Database(Config.DB_PATH) + db = DB(Config.DB_PATH) reddit = praw.Reddit( client_id=Config.CLIENT_ID, client_secret=Config.CLIENT_SECRET, @@ -206,6 +213,34 @@ def main(): logging.info(f"Fetched latest submission {submission.id}") + for job in get_job_entries(Config.FEED_URL): + if db.is_job_posted(job.post_id): + logging.warning( + f"Ignoring already posted job with post ID {job.post_id}" + ) + continue + + comment_text = f"""\ +[**{job.title}** - {job.company_name}]({job.permalink}) + +**Salary:** {job.salary} + +**Location:** {job.location} + +**Job Type:** {job.job_type} + +### Summary + +{job.summary}\ +""" + + comment = submission.reply(comment_text) + db.insert_comment(job.post_id, comment.id) + + logging.info( + f"Posted job with post ID {job.post_id} as reddit comment {comment.id}" + ) + if __name__ == "__main__": main() From 51c33b291349941eccf8ee997bbd203bc41130eb Mon Sep 17 00:00:00 2001 From: git-bruh Date: Wed, 15 Feb 2023 15:34:30 +0530 Subject: [PATCH 5/6] strip html --- job_thread/main.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/job_thread/main.py b/job_thread/main.py index 5440139..c5dbae2 100644 --- a/job_thread/main.py +++ b/job_thread/main.py @@ -1,5 +1,6 @@ import json import logging +import re from copy import deepcopy from dataclasses import dataclass from os import environ, fsync @@ -129,6 +130,10 @@ class Job: permalink: str +def strip_html(text): + return re.sub("<[^<]+?>", "", text) + + def get_job_entries(feed_url): entries = feedparser.parse(feed_url).entries @@ -140,7 +145,7 @@ def get_job_entries(feed_url): location=entry.get("job_listing_location", "N/A"), job_type=entry["job_listing_job_type"], salary=entry.get("job_listing_salary", "N/A"), - summary=entry["summary"], + summary=strip_html(entry["summary"]), permalink=entry["link"], ) for entry in entries From 5eab8bad2d117762103593eac7c68b7740cd18f2 Mon Sep 17 00:00:00 2001 From: git-bruh Date: Fri, 10 Mar 2023 12:46:01 +0530 Subject: [PATCH 6/6] strftime --- job_thread/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/job_thread/main.py b/job_thread/main.py index c5dbae2..03dd6f4 100644 --- a/job_thread/main.py +++ b/job_thread/main.py @@ -4,20 +4,22 @@ import re from copy import deepcopy from dataclasses import dataclass from os import environ, fsync -from time import time +from time import strftime, time import feedparser import praw SECONDS_IN_WEEK = 60 * 60 * 24 * 7 +# Date Month, Year +STRFTIME_FORMAT = "%d %B, %Y" class Config: DB_PATH = "db.json" SUBREDDIT = "developersindia" POST_FLAIR = "Hiring" - POST_TITLE = "Don't Miss Out on These Job Opportunities | Weekly Job Openings Thread" + POST_TITLE = f"Don't Miss Out on These Job Opportunities | Weekly Job Openings Thread | {strftime(STRFTIME_FORMAT)}" POST_TEXT = """\ This thread has all the latest job openings that haven't been posted on previous weekly threads.