Adding a massive feed trimmer. Cuts feeds that have no active subscribers and haven't had a new story in a month down to 10 stories.

2025-09-18 21:43:31 +00:00 · 2013-06-03 17:20:36 -07:00 · 2013-06-03 17:20:36 -07:00 · edcbde615d
commit edcbde615d
parent e84c31884a
2 changed files with 54 additions and 34 deletions
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@ -235,7 +235,7 @@ class Feed(models.Model):

        r.expire('F:%s' % self.pk, settings.DAYS_OF_UNREAD*24*60*60)
        r.expire('zF:%s' % self.pk, settings.DAYS_OF_UNREAD*24*60*60)
-
+    
    @classmethod
    def autocomplete(self, prefix, limit=5):
        results = SearchQuerySet().autocomplete(address=prefix).order_by('-num_subscribers')[:limit]
@ -1041,33 +1041,44 @@ class Feed(models.Model):

        if len(feed_authors) > 1:
            self.save_popular_authors(feed_authors=feed_authors[:-1])
-            
-    def trim_feed(self, verbose=False):
-        trim_cutoff = 500
-        if self.active_subscribers <= 0:
-            trim_cutoff = 25
-        elif self.num_subscribers <= 10 or self.active_premium_subscribers <= 1:
-            trim_cutoff = 100
-        elif self.num_subscribers <= 30  or self.active_premium_subscribers <= 3:
-            trim_cutoff = 200
-        elif self.num_subscribers <= 50  or self.active_premium_subscribers <= 5:
-            trim_cutoff = 300
-        elif self.num_subscribers <= 100 or self.active_premium_subscribers <= 10:
-            trim_cutoff = 350
-        elif self.num_subscribers <= 150 or self.active_premium_subscribers <= 15:
-            trim_cutoff = 400
-        elif self.num_subscribers <= 200 or self.active_premium_subscribers <= 20:
-            trim_cutoff = 450
+
+    @classmethod
+    def trim_old_stories(cls, page=1, verbose=True):
+        limit = 100
+        month_ago = datetime.datetime.now() - datetime.timedelta(days=settings.DAYS_OF_UNREAD*2)
+        old_feeds = cls.objects.filter(active_subscribers__lte=0,
+                                       last_story_date__lte=month_ago)[page*limit:(page+1)*limit]
+        logging.debug(" ---> Trimming %s/p%s old feeds..." % (old_feeds.count(), page))
+        for feed in old_feeds:
+            feed.trim_feed(verbose=verbose, cutoff=10)
+
+    def trim_feed(self, verbose=False, cutoff=None):
+        if not cutoff:
+            cutoff = 500
+            if self.active_subscribers <= 0:
+                cutoff = 25
+            elif self.num_subscribers <= 10 or self.active_premium_subscribers <= 1:
+                cutoff = 100
+            elif self.num_subscribers <= 30  or self.active_premium_subscribers <= 3:
+                cutoff = 200
+            elif self.num_subscribers <= 50  or self.active_premium_subscribers <= 5:
+                cutoff = 300
+            elif self.num_subscribers <= 100 or self.active_premium_subscribers <= 10:
+                cutoff = 350
+            elif self.num_subscribers <= 150 or self.active_premium_subscribers <= 15:
+                cutoff = 400
+            elif self.num_subscribers <= 200 or self.active_premium_subscribers <= 20:
+                cutoff = 450
            
        stories = MStory.objects(
            story_feed_id=self.pk,
        ).order_by('-story_date')
        
-        if stories.count() > trim_cutoff:
+        if stories.count() > cutoff:
            logging.debug('   ---> [%-30s] ~FBFound %s stories. Trimming to ~SB%s~SN...' %
-                          (unicode(self)[:30], stories.count(), trim_cutoff))
+                          (unicode(self)[:30], stories.count(), cutoff))
            try:
-                story_trim_date = stories[trim_cutoff].story_date
+                story_trim_date = stories[cutoff].story_date
            except IndexError, e:
                logging.debug(' ***> [%-30s] ~BRError trimming feed: %s' % (unicode(self)[:30], e))
                return
@ -1079,8 +1090,9 @@ class Feed(models.Model):
                story.delete()
            if verbose:
                existing_story_count = MStory.objects(story_feed_id=self.pk).count()
-                print "Deleted %s stories, %s left." % (extra_stories_count,
-                                                        existing_story_count)
+                logging.debug("   ---> Deleted %s stories, %s left." % (
+                                extra_stories_count,
+                                existing_story_count))

    # @staticmethod
    # def clean_invalid_ids():
--- a/apps/statistics/rstats.py
+++ b/apps/statistics/rstats.py
@ -68,20 +68,28 @@ class RStats:
    def sample(cls, sample=1000, pool=None):
        if not pool:
            pool = settings.REDIS_STORY_HASH_POOL
-        r = redis.Redis(connection_pool=pool)
-        
-        p = r.pipeline()
-        keys = set()
-        prefixes = defaultdict(set)
+
+        r             = redis.Redis(connection_pool=pool)
+        keys          = set()
+        errors        = set()
+        prefixes      = defaultdict(set)
        prefixes_ttls = defaultdict(lambda: defaultdict(int))
-        prefix_re = re.compile(r"(\w+):(.*)")
+        prefix_re     = re.compile(r"(\w+):(.*)")
+
+        p             = r.pipeline()
        [p.randomkey() for _ in range(sample)]
-        keys = set(p.execute())
-        p = r.pipeline()
+        keys          = set(p.execute())
+        p             = r.pipeline()
+
        [p.ttl(key) for key in keys]
-        ttls = p.execute()
+        ttls          = p.execute()
+        
        for k, key in enumerate(keys):
-            prefix, rest = prefix_re.match(key).groups()
+            match = prefix_re.match(key)
+            if not match:
+                errors.add(key)
+                continue
+            prefix, rest = match.groups()
            prefixes[prefix].add(rest)
            ttl = ttls[k]
            if ttl < 60*60: # 1 hour
@ -101,7 +109,7 @@ class RStats:
        print " ---> %s total keys" % keys_count
        for prefix, rest in prefixes.items():
            print " ---> %4s: (%.4s%%) %s keys (%s)" % (prefix, 100. * (len(rest) / float(keys_count)), len(rest), dict(prefixes_ttls[prefix]))
-        
+        print " ---> %s errors: %s" % (len(errors), errors)

 def round_time(dt=None, round_to=60):
   """Round a datetime object to any time laps in seconds