Handling special reddit feeds

Samuel Clay 2023-12-25 08:42:48 -05:00
parent a04587f8fa
commit 8ade315309
2 changed files with 46 additions and 8 deletions


@@ -52,6 +52,7 @@ from utils import json_functions as json
 from celery.exceptions import SoftTimeLimitExceeded
 from utils.twitter_fetcher import TwitterFetcher
 from utils.facebook_fetcher import FacebookFetcher
 from utils.reddit_fetcher import RedditFetcher
 from utils.json_fetcher import JSONFetcher
 # from utils.feed_functions import mail_feed_error_to_admin
@@ -152,7 +153,7 @@ class FetchFeed:
                 )
                 return FEED_ERRHTTP, None
             self.fpf = feedparser.parse(facebook_feed)
-        elif re.match(r'(.*?)reddit.com/\w+/?$', qurl(address, remove=['_'])):
+        elif re.match(r'(.*?)reddit.com/(.*?)$', qurl(address, remove=['_'])):
             reddit_feed = self.fetch_reddit()
             if not reddit_feed:
                 logging.debug(

@@ -25,11 +25,19 @@ class RedditFetcher:
         return self._api

     def fetch(self):
-        subreddit_name = self.extract_subreddit_name()
-        if not subreddit_name:
-            return
-        subreddit = self.fetch_subreddit(subreddit_name)
+        # Common subreddits handled differently
+        # Home page
+        if self.feed.feed_address == "https://reddit.com/.rss":
+            subreddit = self.fetch_subreddit("popular")
+        elif self.feed.feed_address == "https://reddit.com/r/all.rss":
+            subreddit = self.fetch_subreddit("all")
+        elif self.feed.feed_address == "https://reddit.com/r/popular.rss":
+            subreddit = self.fetch_subreddit("popular")
+        else:
+            subreddit_name = self.extract_subreddit_name()
+            if not subreddit_name:
+                return
+            subreddit = self.fetch_subreddit(subreddit_name)

         data = {}
         data['title'] = subreddit.title
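
Note: the new branch is effectively a small lookup table; a hypothetical standalone restatement (not part of the commit) for readers skimming the diff:

SPECIAL_REDDIT_FEEDS = {
    "https://reddit.com/.rss": "popular",       # reddit home page -> r/popular
    "https://reddit.com/r/all.rss": "all",
    "https://reddit.com/r/popular.rss": "popular",
}

def listing_for(feed_address, extracted_subreddit_name):
    # Special addresses map to reddit's built-in listings; everything else
    # falls back to the name parsed from the URL, as the else branch above does.
    return SPECIAL_REDDIT_FEEDS.get(feed_address, extracted_subreddit_name)

assert listing_for("https://reddit.com/.rss", None) == "popular"
assert listing_for("https://reddit.com/r/python.rss", "python") == "python"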
@@ -91,7 +99,8 @@ class RedditFetcher:
             story_data = {}
             story_data['title'] = submission.title
             story_data['link'] = submission.url
-            story_data['description'] = submission.selftext
+            story_data['description'] = self.process_story_text(submission)
+            story_data['author_name'] = submission.author.name
             story_data['categories'] = []
             story_data['unique_id'] = "reddit_post:%s" % submission.id
             story_data['pubdate'] = datetime.datetime.fromtimestamp(submission.created_utc)
@@ -101,12 +110,40 @@ class RedditFetcher:
         story_data = {}
         story_data['title'] = submission.title
         story_data['link'] = submission.url
-        story_data['description'] = submission.selftext
+        story_data['description'] = self.process_story_text(submission)
+        story_data["author_name"] = submission.author.name
         story_data['categories'] = []
         story_data['unique_id'] = "reddit_post:%s" % submission.id
         story_data['pubdate'] = datetime.datetime.fromtimestamp(submission.created_utc)
         return story_data

+    def process_story_text(self, submission):
+        text = submission.selftext
+
+        # Wrap blocks with four spaces in <pre> tags
+        text = re.sub(r'(^\s{4})(.*\n)', r'<pre>\2</pre>', text, flags=re.M)
+        # Wrap links in <a> tags
+        text = re.sub(r'(https?://[^\s]+)', r'<a href="\1">\1</a>', text, flags=re.M)
+        # Wrap image links in <img> tags
+        text = re.sub(r'(https?://[^\s]+\.(jpg|jpeg|gif|png))', r'<img src="\1" />', text, flags=re.M)
+        # Wrap bold text in <b> tags
+        text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
+        # Wrap italics text in <i> tags
+        text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text)
+
+        # Replace newlines with <br> tags
+        text = text.replace('\r\n', '\n')
+        text = text.replace('\r', '\n')
+        text = linebreaks(text)
+
+        # Add author and [link] [comments] footer
+        permalink = submission.permalink
+        if submission.is_self:
+            permalink = submission.url
+        text = f'{text}\n\n<p>Posted by <a href="https://reddit.com/u/{submission.author.name}">{submission.author.name}</a><br><a href="{permalink}">[link]</a> <a href="{permalink}">[comments]</a></p>'
+        return text
+
     def favicon_url(self, subreddit=None):
         if not subreddit:
             subreddit_name = self.extract_subreddit_name()
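
Note: the inline-markup passes in process_story_text can be exercised on their own; a minimal sketch, not part of the commit, reusing the exact substitutions from the diff (the full method additionally runs the result through linebreaks, presumably Django's helper, and appends the author/[link]/[comments] footer):

import re

sample = (
    "Check out **this** release at https://example.com/shot.png and *tell* me\n"
    "    print('indented code line')\n"
)

text = sample
text = re.sub(r'(^\s{4})(.*\n)', r'<pre>\2</pre>', text, flags=re.M)
text = re.sub(r'(https?://[^\s]+)', r'<a href="\1">\1</a>', text, flags=re.M)
text = re.sub(r'(https?://[^\s]+\.(jpg|jpeg|gif|png))', r'<img src="\1" />', text, flags=re.M)
text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text)
print(text)
# Because the generic link wrap runs before the image wrap, a bare image URL is
# already inside an <a> tag by the time the <img> substitution sees it.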