High volume feeds

This commit is contained in:
Samuel Clay 2025-08-20 22:12:47 -07:00
parent e632f13c68
commit 66c5355eac

View file

@@ -143,6 +143,8 @@ def preprocess_feed_encoding(raw_xml):
# http://feedjack.googlecode.com # http://feedjack.googlecode.com
MAX_ENTRIES_TO_PROCESS = 100 MAX_ENTRIES_TO_PROCESS = 100
MAX_ENTRIES_HIGH_VOLUME = 250
HIGH_VOLUME_FEED_URLS = ['arxiv.org'] # Feeds that can handle more stories per fetch
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5)) FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
@@ -566,10 +568,20 @@ class ProcessFeed:
) )
self.feed_entries = self.fpf.entries self.feed_entries = self.fpf.entries
# If there are more than 100 entries, we should sort the entries in date descending order and cut them off
if len(self.feed_entries) > MAX_ENTRIES_TO_PROCESS: # Check if this is a high-volume feed that can handle more stories
max_entries = MAX_ENTRIES_TO_PROCESS
feed_address_lower = self.feed.feed_address.lower()
for high_volume_url in HIGH_VOLUME_FEED_URLS:
if high_volume_url in feed_address_lower:
max_entries = MAX_ENTRIES_HIGH_VOLUME
logging.debug(f" ---> [{self.feed.log_title[:30]:<30}] High-volume feed detected ({high_volume_url}), allowing up to {max_entries} stories")
break
# If there are more than max_entries, we should sort the entries in date descending order and cut them off
if len(self.feed_entries) > max_entries:
self.feed_entries = sorted(self.feed_entries, key=lambda x: extract_story_date(x), reverse=True)[ self.feed_entries = sorted(self.feed_entries, key=lambda x: extract_story_date(x), reverse=True)[
:MAX_ENTRIES_TO_PROCESS :max_entries
] ]
if not self.options.get("archive_page", None): if not self.options.get("archive_page", None):