Sending all of a user's feeds through the archive fetcher to fill out their backfill, then emailing them with the new story count.

Samuel Clay 2022-04-18 13:29:13 -04:00
parent 43695b8cd4
commit ac593494db
6 changed files with 144 additions and 19 deletions
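
At a high level, the new flow fans each user's subscriptions out to chunked Celery tasks and joins them with a chord whose callback sends the summary email. A minimal sketch of the pattern, using the task names and the chunk size/queue from the diff below (the wrapper function and its arguments are illustrative, not part of the commit):

    import celery

    from apps.profile.tasks import FetchArchiveFeedsChunk, FinishFetchArchiveFeeds

    def schedule_archive_fetch(user_id, feed_ids, start_time, starting_story_count):
        # Fan out: one task per chunk of 6 feed ids, all on the search_indexer queue.
        header = [FetchArchiveFeedsChunk.s(feed_ids=feed_ids[i:i+6], user_id=user_id)
                                        .set(queue='search_indexer')
                  for i in range(0, len(feed_ids), 6)]
        # Join: the chord callback runs once every chunk task has finished.
        callback = FinishFetchArchiveFeeds.s(user_id=user_id,
                                             start_time=start_time,
                                             starting_story_count=starting_story_count
                                             ).set(queue='search_indexer')
        celery.chord(header)(callback)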

View file

@@ -265,9 +265,7 @@ class Profile(models.Model):
return True
def activate_archive(self, never_expire=False):
from apps.profile.tasks import EmailNewPremiumArchive
EmailNewPremiumArchive.delay(user_id=self.user.pk)
UserSubscription.schedule_fetch_archive_feeds_for_user(self.user.pk)
was_premium = self.is_premium
was_archive = self.is_archive
@@ -289,6 +287,8 @@ class Profile(models.Model):
except (IntegrityError, Feed.DoesNotExist):
pass
# Count subscribers to turn on archive_subscribers counts, then show that count to users
# on the paypal_archive_return page.
try:
scheduled_feeds = [sub.feed.pk for sub in subs]
except Feed.DoesNotExist:
@@ -296,7 +296,7 @@ class Profile(models.Model):
logging.user(self.user, "~SN~FMTasking the scheduling immediate premium setup of ~SB%s~SN feeds..." %
len(scheduled_feeds))
SchedulePremiumSetup.apply_async(kwargs=dict(feed_ids=scheduled_feeds))
UserSubscription.queue_new_feeds(self.user)
self.setup_premium_history()
@@ -1249,8 +1249,8 @@ class Profile(models.Model):
logging.user(self.user, "~BB~FM~SBSending email for new premium: %s" % self.user.email)
def send_new_premium_archive_email(self, force=False):
if not self.user.email or not self.send_emails:
def send_new_premium_archive_email(self, new_story_count, total_story_count, force=False):
if not self.user.email:
return
params = dict(receiver_user_id=self.user.pk, email_type='new_premium_archive')
@@ -1265,7 +1265,7 @@ class Profile(models.Model):
user = self.user
text = render_to_string('mail/email_new_premium_archive.txt', locals())
html = render_to_string('mail/email_new_premium_archive.xhtml', locals())
subject = "Thank you for subscribing to NewsBlur Premium Archive!"
subject = f"Your NewsBlur Premium Archive subscription now holds {total_story_count:,} stories"
msg = EmailMultiAlternatives(subject, text,
from_email='NewsBlur <%s>' % settings.HELLO_EMAIL,
to=['%s <%s>' % (user, user.email)])
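
The new subject line leans on Python's "," format specifier to add thousands separators; for example (the count here is made up):

    total_story_count = 1482305
    subject = f"Your NewsBlur Premium Archive subscription now holds {total_story_count:,} stories"
    # -> 'Your NewsBlur Premium Archive subscription now holds 1,482,305 stories'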

View file

@@ -15,10 +15,30 @@ def EmailNewPremium(user_id):
user_profile = Profile.objects.get(user__pk=user_id)
user_profile.send_new_premium_email()
@app.task(name="email-new-premium-archive")
def EmailNewPremiumArchive(user_id):
@app.task()
def FetchArchiveFeedsForUser(user_id):
subs = UserSubscription.objects.filter(user=user_id)
user_profile = Profile.objects.get(user__pk=user_id)
user_profile.send_new_premium_archive_email()
logging.user(user_profile.user, f"~FCBeginning archive feed fetches for ~SB~FG{subs.count()} feeds~SN...")
UserSubscription.fetch_archive_feeds_for_user(user_id)
@app.task()
def FetchArchiveFeedsChunk(user_id, feed_ids):
logging.debug(" ---> Fetching archive stories: %s for %s" % (feed_ids, user_id))
UserSubscription.fetch_archive_feeds_chunk(user_id, feed_ids)
@app.task()
def FinishFetchArchiveFeeds(results, user_id, start_time, starting_story_count):
logging.debug(" ---> Fetching archive stories finished for %s" % (user_id))
ending_story_count = UserSubscription.finish_fetch_archive_feeds(user_id, start_time)
new_story_count = ending_story_count - starting_story_count
subs = UserSubscription.objects.filter(user=user_id)
user_profile = Profile.objects.get(user__pk=user_id)
logging.user(user_profile.user, f"~FCFinished archive feed fetches for ~SB~FG{subs.count()} feeds~FC~SN: ~FG~SB{new_story_count} new~SB~FC, ~FG{ending_story_count} total")
user_profile.send_new_premium_archive_email(new_story_count, ending_story_count)
@app.task(name="email-new-premium-pro")
def EmailNewPremiumPro(user_id):
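
One detail worth noting: FinishFetchArchiveFeeds takes results as its first parameter because it runs as a chord callback, so Celery prepends the return values of the header tasks; only the keyword arguments come from the .s() signature built in the reader models. Schematically:

    @app.task()
    def FinishFetchArchiveFeeds(results, user_id, start_time, starting_story_count):
        # `results` is injected by celery (one entry per FetchArchiveFeedsChunk);
        # the kwargs were bound when the callback signature was created.
        ...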

View file

@@ -380,15 +380,16 @@ def profile_is_premium_archive(request):
subs = UserSubscription.objects.filter(user=request.user)
total_subs = subs.count()
activated_subs = subs.filter(active=True).count()
activated_subs = subs.filter(feed__archive_subscribers__gte=1).count()
if retries >= 30:
code = -1
if not request.user.profile.is_premium:
subject = "Premium activation failed: %s (%s/%s)" % (request.user, activated_subs, total_subs)
subject = "Premium archive activation failed: %s (%s/%s)" % (request.user, activated_subs, total_subs)
message = """User: %s (%s) -- Email: %s""" % (request.user.username, request.user.pk, request.user.email)
mail_admins(subject, message)
request.user.profile.is_premium = True
request.user.profile.is_premium_archive = True
request.user.profile.save()
return {

View file

@@ -3,6 +3,7 @@ import time
import re
import redis
import pymongo
import celery
import mongoengine as mongo
from operator import itemgetter
from pprint import pprint
@@ -573,7 +574,88 @@ class UserSubscription(models.Model):
if stale_feeds:
stale_feeds = list(set([f.feed_id for f in stale_feeds]))
cls.queue_new_feeds(user, new_feeds=stale_feeds)
@classmethod
def schedule_fetch_archive_feeds_for_user(cls, user_id):
from apps.profile.tasks import FetchArchiveFeedsForUser
FetchArchiveFeedsForUser.apply_async(kwargs=dict(user_id=user_id),
queue='search_indexer')
# Should be run as a background task
@classmethod
def fetch_archive_feeds_for_user(cls, user_id):
from apps.profile.tasks import FetchArchiveFeedsChunk, FinishFetchArchiveFeeds
start_time = time.time()
user = User.objects.get(pk=user_id)
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
r.publish(user.username, 'fetch_archive:start')
subscriptions = UserSubscription.objects.filter(user=user).only('feed')
total = subscriptions.count()
feed_ids = []
starting_story_count = 0
for sub in subscriptions:
try:
feed_ids.append(sub.feed.pk)
except Feed.DoesNotExist:
continue
starting_story_count += MStory.objects(story_feed_id=sub.feed.pk).count()
feed_id_chunks = [c for c in chunks(feed_ids, 6)]
logging.user(user, "~FCFetching archive stories from ~SB%s feeds~SN in %s chunks..." %
(total, len(feed_id_chunks)))
search_chunks = [FetchArchiveFeedsChunk.s(feed_ids=feed_id_chunk,
user_id=user_id
).set(queue='search_indexer')
for feed_id_chunk in feed_id_chunks]
callback = FinishFetchArchiveFeeds.s(user_id=user_id,
start_time=start_time,
starting_story_count=starting_story_count).set(queue='search_indexer')
celery.chord(search_chunks)(callback)
@classmethod
def fetch_archive_feeds_chunk(cls, user_id, feed_ids):
from apps.rss_feeds.models import Feed
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
user = User.objects.get(pk=user_id)
logging.user(user, "~FCFetching archive stories from %s feeds..." % len(feed_ids))
for feed_id in feed_ids:
feed = Feed.get_by_id(feed_id)
if not feed: continue
feed.fill_out_archive_stories()
r.publish(user.username, 'fetch_archive:feeds:%s' %
','.join([str(f) for f in feed_ids]))
@classmethod
def finish_fetch_archive_feeds(cls, user_id, start_time):
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
user = User.objects.get(pk=user_id)
subscriptions = UserSubscription.objects.filter(user=user).only('feed')
total = subscriptions.count()
duration = time.time() - start_time
ending_story_count = 0
for sub in subscriptions:
try:
ending_story_count += MStory.objects(story_feed_id=sub.feed.pk).count()
except Feed.DoesNotExist:
continue
logging.user(user, "~FCFetched archive stories from ~SB%s feeds~SN in ~FM~SB%s~FC~SN sec." %
(total, round(duration, 2)))
r.publish(user.username, 'fetch_archive:done')
return ending_story_count
@classmethod
def identify_deleted_feed_users(cls, old_feed_id):
users = UserSubscriptionFolders.objects.filter(folders__contains=old_feed_id).only('user')
@@ -1126,7 +1208,8 @@ class UserSubscription(models.Model):
return table
# return cofeeds
class RUserStory:
@classmethod
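
The three r.publish calls above give clients a progress stream over Redis pub/sub on the user's username channel: fetch_archive:start, one fetch_archive:feeds:<id,id,...> per finished chunk, and fetch_archive:done. A minimal listener sketch, assuming a standalone Python consumer (NewsBlur's actual consumer is the web front end, which is not part of this commit):

    import redis
    from django.conf import settings

    def watch_archive_progress(username):
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        pubsub = r.pubsub()
        pubsub.subscribe(username)
        for message in pubsub.listen():
            if message['type'] != 'message':
                continue
            data = message['data'].decode()
            if data == 'fetch_archive:start':
                print("archive backfill started")
            elif data.startswith('fetch_archive:feeds:'):
                feed_ids = data.split(':', 2)[2].split(',')
                print(f"chunk finished: {len(feed_ids)} feeds")
            elif data == 'fetch_archive:done':
                print("archive backfill complete")
                break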

View file

@@ -608,6 +608,7 @@ class Feed(models.Model):
r.zremrangebyrank('error_feeds', 0, -1)
else:
logging.debug(" ---> No errored feeds to drain")
def update_all_statistics(self, has_new_stories=False, force=False):
recount = not self.counts_converted_to_redis
count_extra = False
@@ -1032,12 +1033,25 @@ class Feed(models.Model):
else:
return 'black'
def fill_out_archive_stories(self):
def fill_out_archive_stories(self, force=False):
"""
Starting from page 1 and iterating through N pages, determine whether
page(i) matches page(i-1) and if there are any new stories.
"""
before_story_count = MStory.objects(story_feed_id=self.pk).count()
if not force and not self.archive_subscribers:
logging.debug(" ---> [%-30s] ~FBNot filling out archive stories, no archive subscribers" % (
self.log_title[:30]))
return before_story_count, before_story_count
self.update(archive_page=1)
after_story_count = MStory.objects(story_feed_id=self.pk).count()
logging.debug(" ---> [%-30s] ~FCFilled out archive, ~FM~SB%s~SN new stories~FC, total of ~SB%s~SN stories" % (
self.log_title[:30],
after_story_count - before_story_count,
after_story_count))
def save_feed_stories_last_month(self, verbose=False):
month_ago = datetime.datetime.utcnow() - datetime.timedelta(days=30)
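
The docstring's page(i)-versus-page(i-1) test is implemented in FeedFetcherWorker.fetch_and_process_archive_pages (last file below) by accumulating story hashes and counting a page as failed when it adds none. A condensed sketch; the stop threshold and the page-fetch helper are assumptions, since the diff shows only the failure counting:

    seen_story_hashes = set()
    failed_pages = 0
    page = 1

    def fetch_page_story_hashes(page):
        # hypothetical stand-in for the FetchFeed/ProcessFeed round trip below
        return set()

    while failed_pages < 10:  # threshold assumed, not shown in the diff
        before = len(seen_story_hashes)
        seen_story_hashes.update(fetch_page_story_hashes(page))
        if len(seen_story_hashes) == before:
            failed_pages += 1  # page(i) matched page(i-1): nothing new
        page += 1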

View file

@@ -503,11 +503,11 @@ class ProcessFeed:
if not story['title'] and not story['story_content']:
continue
if self.options.get('archive_page', None) and story.get('published') > day_ago:
# Arbitrary but necessary to prevent broken feeds from creating an unlimited number of stories
# Archive only: Arbitrary but necessary to prevent feeds from creating an unlimited number of stories
# because they don't have a guid so it gets auto-generated based on the date, and if the story
# is missing a date, then the latest date gets used. So reject anything newer than 24 hours old
# when filling out the archive.
logging.debug(f" ---> [%-30s] ~FBTossing story because it's too new for the archive: ~SB{story}")
# logging.debug(f" ---> [%-30s] ~FBTossing story because it's too new for the archive: ~SB{story}")
continue
if story.get('published') < start_date:
start_date = story.get('published')
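
The guard above reads in isolation like this: while paging through the archive, any story whose parsed date falls inside the last 24 hours is treated as misdated (no guid, so the date was auto-filled) and skipped; live fetches are unaffected. A sketch, assuming 'published' has already been normalized to a datetime:

    import datetime

    day_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=24)

    def keep_for_archive(story, options):
        # Only archive crawls (options['archive_page'] set) reject too-new stories.
        if options.get('archive_page') and story.get('published') > day_ago:
            return False
        return True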
@@ -1142,10 +1142,10 @@ class FeedFetcherWorker:
# time_taken = datetime.datetime.utcnow() - self.time_start
def fetch_and_process_archive_pages(self, feed_id):
seen_story_hashes = set()
feed = Feed.get_by_id(feed_id)
for archive_page_key in ["page", "paged"]:
seen_story_hashes = set()
failed_pages = 0
self.options['archive_page_key'] = archive_page_key
@@ -1155,7 +1155,13 @@ class FeedFetcherWorker:
self.options['archive_page'] = page+1
ffeed = FetchFeed(feed_id, self.options)
ret_feed, fetched_feed = ffeed.fetch()
try:
ret_feed, fetched_feed = ffeed.fetch()
except TimeoutError as e:
logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.log_title[:30]))
failed_pages += 1
continue
raw_feed = ffeed.raw_feed
if fetched_feed and ret_feed == FEED_OK:
@@ -1169,12 +1175,13 @@ class FeedFetcherWorker:
seen_story_hashes.update(pfeed.archive_seen_story_hashes)
after_story_hashes = len(seen_story_hashes)
logging.debug(f" ---> [{feed.log_title[:30]:<30}] ~FBStory hashes found: ~SB{len(seen_story_hashes)} stories, ~SN~FR{failed_pages}~FB failures")
if before_story_hashes == after_story_hashes:
failed_pages += 1
else:
failed_pages += 1
logging.debug(f" ---> [{feed.log_title[:30]:<30}] ~FBStory hashes found: ~SB{len(seen_story_hashes)} stories, ~SN~FR{failed_pages}~FB failures")
def publish_to_subscribers(self, feed, new_count):
try:
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)