High volume feeds

2025-09-18 21:50:56 +00:00 · 2025-08-20 22:12:47 -07:00 · 2025-08-20 22:12:47 -07:00 · 66c5355eac
commit 66c5355eac
parent e632f13c68
1 changed file with 15 additions and 3 deletions
--- a/utils/feed_fetcher.py
+++ b/utils/feed_fetcher.py
@@ -143,6 +143,8 @@ def preprocess_feed_encoding(raw_xml):
 # http://feedjack.googlecode.com

 MAX_ENTRIES_TO_PROCESS = 100
+MAX_ENTRIES_HIGH_VOLUME = 250
+HIGH_VOLUME_FEED_URLS = ['arxiv.org']  # Feeds that can handle more stories per fetch

 FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))

@@ -566,10 +568,20 @@ class ProcessFeed:
                        )

        self.feed_entries = self.fpf.entries
-        # If there are more than 100 entries, we should sort the entries in date descending order and cut them off
-        if len(self.feed_entries) > MAX_ENTRIES_TO_PROCESS:
+        
+        # Check if this is a high-volume feed that can handle more stories
+        max_entries = MAX_ENTRIES_TO_PROCESS
+        feed_address_lower = self.feed.feed_address.lower()
+        for high_volume_url in HIGH_VOLUME_FEED_URLS:
+            if high_volume_url in feed_address_lower:
+                max_entries = MAX_ENTRIES_HIGH_VOLUME
+                logging.debug(f"   ---> [{self.feed.log_title[:30]:<30}] High-volume feed detected ({high_volume_url}), allowing up to {max_entries} stories")
+                break
+        
+        # If there are more than max_entries, we should sort the entries in date descending order and cut them off
+        if len(self.feed_entries) > max_entries:
            self.feed_entries = sorted(self.feed_entries, key=lambda x: extract_story_date(x), reverse=True)[
-                :MAX_ENTRIES_TO_PROCESS
+                :max_entries
            ]

        if not self.options.get("archive_page", None):