NewsBlur-viq/utils/feed_fetcher.py


import datetime
import multiprocessing
import time
import traceback
import django
django.setup()
import http
import http.client
import urllib.error
import urllib.parse
import urllib.request
http.client._MAXHEADERS = 10000
import random
import re
import xml.sax
import feedparser
import pymongo
import redis
import requests
from django.conf import settings
from django.core.cache import cache
from django.db import IntegrityError
from sentry_sdk import set_user
from apps.notifications.models import MUserFeedNotification
from apps.notifications.tasks import QueueNotifications
from apps.push.models import PushSubscription
from apps.reader.models import UserSubscription
from apps.rss_feeds.icon_importer import IconImporter
from apps.rss_feeds.models import Feed, MStory
from apps.rss_feeds.page_importer import PageImporter
from apps.statistics.models import MAnalyticsFetcher, MStatistics
feedparser.sanitizer._HTMLSanitizer.acceptable_elements.update(["iframe"])
feedparser.sanitizer._HTMLSanitizer.acceptable_elements.update(["text"])
from bs4 import BeautifulSoup
from celery.exceptions import SoftTimeLimitExceeded
from django.utils import feedgenerator
from django.utils.encoding import smart_str
from django.utils.html import linebreaks
from mongoengine import connect, connection
from qurl import qurl
from sentry_sdk import capture_exception, flush
from utils import json_functions as json
from utils import log as logging
from utils.facebook_fetcher import FacebookFetcher
from utils.feed_functions import TimeoutError, strip_underscore_from_feed_address, timelimit
from utils.json_fetcher import JSONFetcher
from utils.story_functions import linkify, pre_process_story, strip_tags
from utils.twitter_fetcher import TwitterFetcher
from utils.youtube_fetcher import YoutubeFetcher
# from utils.feed_functions import mail_feed_error_to_admin
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
NO_UNDERSCORE_ADDRESSES = ["jwz"]
class FetchFeed:
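    """Downloads a feed's XML/JSON (or a service-specific payload for YouTube,
    Twitter, Facebook, and JSON feeds) and parses it with feedparser. The parsed
    result is handed off to ProcessFeed."""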
def __init__(self, feed_id, options):
self.feed = Feed.get_by_id(feed_id)
self.options = options
self.fpf = None
self.raw_feed = None
@timelimit(45)
def fetch(self):
"""
Uses requests to download the feed, parsing it in feedparser. Will be storified later.
"""
start = time.time()
identity = self.get_identity()
if self.options.get("archive_page", None):
log_msg = "%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY) ~BG~FMarchive page~ST~FY: ~SB%s" % (
identity,
self.feed.log_title[:30],
self.feed.id,
self.options["archive_page"],
)
else:
log_msg = "%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s" % (
identity,
self.feed.log_title[:30],
self.feed.id,
datetime.datetime.now() - self.feed.last_update,
)
logging.debug(log_msg)
etag = self.feed.etag
modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
address = self.feed.feed_address
if self.options.get("force") or self.options.get("archive_page", None) or random.random() <= 0.01:
self.options["force"] = True
modified = None
etag = None
if self.options.get("archive_page", None) == "rfc5005" and self.options.get(
"archive_page_link", None
):
address = self.options["archive_page_link"]
elif self.options.get("archive_page", None):
address = qurl(address, add={self.options["archive_page_key"]: self.options["archive_page"]})
# Don't use the underscore cache buster: https://forum.newsblur.com/t/jwz-feed-broken-hes-mad-about-url-parameters/10742/15
# elif address.startswith("http") and not any(item in address for item in NO_UNDERSCORE_ADDRESSES):
# address = qurl(address, add={"_": random.randint(0, 10000)})
logging.debug(" ---> [%-30s] ~FBForcing fetch: %s" % (self.feed.log_title[:30], address))
elif not self.feed.fetched_once or not self.feed.known_good:
modified = None
etag = None
if self.options.get("feed_xml"):
logging.debug(
" ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s"
% (self.feed.log_title[:30], len(self.options.get("feed_xml")))
)
if self.options.get("fpf"):
self.fpf = self.options.get("fpf")
logging.debug(
" ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping." % (self.feed.log_title[:30])
)
return FEED_OK, self.fpf
if "youtube.com" in address:
youtube_feed = self.fetch_youtube()
if not youtube_feed:
logging.debug(
" ***> [%-30s] ~FRYouTube fetch failed: %s." % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(youtube_feed, sanitize_html=False)
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
twitter_feed = self.fetch_twitter(address)
if not twitter_feed:
logging.debug(
" ***> [%-30s] ~FRTwitter fetch failed: %s" % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(twitter_feed)
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
facebook_feed = self.fetch_facebook()
if not facebook_feed:
logging.debug(
" ***> [%-30s] ~FRFacebook fetch failed: %s" % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(facebook_feed)
if not self.fpf and "json" in address:
try:
headers = self.feed.fetch_headers()
if etag:
headers["If-None-Match"] = etag
if modified:
# format into an RFC 1123-compliant timestamp. We can't use
# time.strftime() since the %a and %b directives can be affected
# by the current locale, but RFC 2616 states that dates must be
# in English.
short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
months = [
"Jan",
"Feb",
"Mar",
"Apr",
"May",
"Jun",
"Jul",
"Aug",
"Sep",
"Oct",
"Nov",
"Dec",
]
modified_header = "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
short_weekdays[modified[6]],
modified[2],
months[modified[1] - 1],
modified[0],
modified[3],
modified[4],
modified[5],
)
headers["If-Modified-Since"] = modified_header
if etag or modified:
headers["A-IM"] = "feed"
try:
raw_feed = requests.get(address, headers=headers, timeout=15)
except (requests.adapters.ConnectionError, TimeoutError):
raw_feed = None
if not raw_feed or raw_feed.status_code >= 400:
if raw_feed:
logging.debug(
" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s"
% (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers)
)
else:
logging.debug(
" ***> [%-30s] ~FRJson feed fetch timed out, trying fake headers: %s"
% (self.feed.log_title[:30], address)
)
raw_feed = requests.get(
self.feed.feed_address,
headers=self.feed.fetch_headers(fake=True),
timeout=15,
)
json_feed_content_type = any(
json_feed in raw_feed.headers.get("Content-Type", "")
for json_feed in ["application/feed+json", "application/json"]
)
if raw_feed.content and json_feed_content_type:
# JSON Feed
json_feed = self.fetch_json_feed(address, raw_feed)
if not json_feed:
logging.debug(
" ***> [%-30s] ~FRJSON fetch failed: %s" % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(json_feed)
elif raw_feed.content and raw_feed.status_code < 400:
response_headers = raw_feed.headers
response_headers["Content-Location"] = raw_feed.url
self.raw_feed = smart_str(raw_feed.content)
self.fpf = feedparser.parse(self.raw_feed, response_headers=response_headers)
if self.options["verbose"]:
logging.debug(
" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
% (
self.feed.log_title[:30],
raw_feed.status_code,
len(smart_str(raw_feed.content)),
raw_feed.headers,
)
)
except Exception as e:
logging.debug(
" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s"
% (self.feed.log_title[:30], str(e))
)
# raise e
if not self.fpf or self.options.get("force_fp", False):
try:
self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified)
except (
TypeError,
ValueError,
KeyError,
EOFError,
MemoryError,
urllib.error.URLError,
http.client.InvalidURL,
http.client.BadStatusLine,
http.client.IncompleteRead,
ConnectionResetError,
TimeoutError,
) as e:
logging.debug(" ***> [%-30s] ~FRFeed fetch error: %s" % (self.feed.log_title[:30], e))
pass
if not self.fpf:
try:
logging.debug(
" ***> [%-30s] ~FRTurning off headers: %s" % (self.feed.log_title[:30], address)
)
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
except (
TypeError,
ValueError,
KeyError,
EOFError,
MemoryError,
urllib.error.URLError,
http.client.InvalidURL,
http.client.BadStatusLine,
http.client.IncompleteRead,
ConnectionResetError,
) as e:
logging.debug(" ***> [%-30s] ~FRFetch failed: %s." % (self.feed.log_title[:30], e))
return FEED_ERRHTTP, None
logging.debug(
" ---> [%-30s] ~FYFeed fetch in ~FM%.4ss" % (self.feed.log_title[:30], time.time() - start)
)
return FEED_OK, self.fpf
def get_identity(self):
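        """Returns this worker's multiprocessing identity (or "X" when not running
        in a worker process), used to prefix log lines."""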
identity = "X"
current_process = multiprocessing.current_process()
if current_process._identity:
identity = current_process._identity[0]
return identity
def fetch_twitter(self, address=None):
twitter_fetcher = TwitterFetcher(self.feed, self.options)
return twitter_fetcher.fetch(address)
def fetch_facebook(self):
facebook_fetcher = FacebookFetcher(self.feed, self.options)
return facebook_fetcher.fetch()
def fetch_json_feed(self, address, headers):
json_fetcher = JSONFetcher(self.feed, self.options)
return json_fetcher.fetch(address, headers)
def fetch_youtube(self):
youtube_fetcher = YoutubeFetcher(self.feed, self.options)
return youtube_fetcher.fetch()
class ProcessFeed:
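    """Takes a parsed feed from FetchFeed and turns its entries into stories:
    dedupes guids and permalinks, adds or updates MStory documents, and records
    feed history and statistics."""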
def __init__(self, feed_id, fpf, options, raw_feed=None):
self.feed_id = feed_id
self.options = options
self.fpf = fpf
self.raw_feed = raw_feed
self.archive_seen_story_hashes = set()
def refresh_feed(self):
self.feed = Feed.get_by_id(self.feed_id)
if self.feed_id != self.feed.pk:
logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk))
self.feed_id = self.feed.pk
def process(self):
"""Downloads and parses a feed."""
start = time.time()
self.refresh_feed()
if not self.options.get("archive_page", None):
feed_status, ret_values = self.verify_feed_integrity()
if feed_status and ret_values:
return feed_status, ret_values
self.fpf.entries = self.fpf.entries[:100]
if not self.options.get("archive_page", None):
self.compare_feed_attribute_changes()
# Determine if stories aren't valid and replace broken guids
guids_seen = set()
permalinks_seen = set()
for entry in self.fpf.entries:
guids_seen.add(entry.get("guid"))
permalinks_seen.add(Feed.get_permalink(entry))
guid_difference = len(guids_seen) != len(self.fpf.entries)
single_guid = len(guids_seen) == 1
replace_guids = single_guid and guid_difference
permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
single_permalink = len(permalinks_seen) == 1
replace_permalinks = single_permalink and permalink_difference
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
story_hashes = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry, self.fpf.encoding)
if not story["title"] and not story["story_content"]:
continue
if self.options.get("archive_page", None) and story.get("published") > day_ago:
                # Archive only: arbitrary but necessary to prevent feeds from creating an
                # unlimited number of stories. Stories without a guid get one auto-generated
                # from the date, and stories missing a date fall back to the latest date, so
                # reject anything newer than 24 hours old when filling out the archive.
# logging.debug(f" ---> [%-30s] ~FBTossing story because it's too new for the archive: ~SB{story}")
continue
if story.get("published") < start_date:
start_date = story.get("published")
if replace_guids:
if replace_permalinks:
new_story_guid = str(story.get("published"))
if self.options["verbose"]:
logging.debug(
" ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s"
% (self.feed.log_title[:30], story.get("guid"), new_story_guid)
)
story["guid"] = new_story_guid
else:
new_story_guid = Feed.get_permalink(story)
if self.options["verbose"]:
logging.debug(
" ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s"
% (self.feed.log_title[:30], story.get("guid"), new_story_guid)
)
story["guid"] = new_story_guid
story["story_hash"] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get("guid"))
stories.append(story)
story_hashes.append(story.get("story_hash"))
original_story_hash_count = len(story_hashes)
story_hashes_in_unread_cutoff = self.feed.story_hashes_in_unread_cutoff[:original_story_hash_count]
story_hashes.extend(story_hashes_in_unread_cutoff)
story_hashes = list(set(story_hashes))
if self.options["verbose"] or settings.DEBUG:
logging.debug(
" ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db"
% (
self.feed.log_title[:30],
original_story_hash_count,
len(story_hashes) - original_story_hash_count,
len(story_hashes_in_unread_cutoff),
)
)
existing_stories = dict(
(s.story_hash, s)
for s in MStory.objects(
story_hash__in=story_hashes,
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
)
)
# if len(existing_stories) == 0:
# existing_stories = dict((s.story_hash, s) for s in MStory.objects(
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
# ))
ret_values = self.feed.add_update_stories(
stories,
existing_stories,
verbose=self.options["verbose"],
updates_off=self.options["updates_off"],
)
# PubSubHubbub
if not self.options.get("archive_page", None):
self.check_feed_for_push()
# Push notifications
if ret_values["new"] > 0 and MUserFeedNotification.feed_has_users(self.feed.pk) > 0:
QueueNotifications.delay(self.feed.pk, ret_values["new"])
# All Done
logging.debug(
" ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s"
% (
self.feed.log_title[:30],
"~FG~SB" if ret_values["new"] else "",
ret_values["new"],
"~FY~SB" if ret_values["updated"] else "",
ret_values["updated"],
"~SB" if ret_values["same"] else "",
ret_values["same"],
"~FR~SB" if ret_values["error"] else "",
ret_values["error"],
len(self.fpf.entries),
)
)
self.feed.update_all_statistics(has_new_stories=bool(ret_values["new"]), force=self.options["force"])
fetch_date = datetime.datetime.now()
if ret_values["new"]:
if not getattr(settings, "TEST_DEBUG", False):
self.feed.trim_feed()
self.feed.expire_redis()
if MStatistics.get("raw_feed", None) == self.feed.pk:
self.feed.save_raw_feed(self.raw_feed, fetch_date)
self.feed.save_feed_history(200, "OK", date=fetch_date)
if self.options["verbose"]:
logging.debug(
" ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss"
% (self.feed.log_title[:30], time.time() - start)
)
if self.options.get("archive_page", None):
self.archive_seen_story_hashes.update(story_hashes)
return FEED_OK, ret_values
def verify_feed_integrity(self):
"""Ensures stories come through and any abberant status codes get saved
Returns:
FEED_STATUS: enum
ret_values: dictionary of counts of new, updated, same, and error stories
"""
ret_values = dict(new=0, updated=0, same=0, error=0)
if not self.feed:
return FEED_ERREXC, ret_values
if hasattr(self.fpf, "status"):
if self.options["verbose"]:
if self.fpf.bozo and self.fpf.status != 304:
logging.debug(
" ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)"
% (self.feed.log_title[:30], self.fpf.bozo_exception, len(self.fpf.entries))
)
if self.fpf.status == 304:
self.feed = self.feed.save()
self.feed.save_feed_history(304, "Not modified")
return FEED_SAME, ret_values
# 302 and 307: Temporary redirect: ignore
# 301 and 308: Permanent redirect: save it (after 10 tries)
if self.fpf.status == 301 or self.fpf.status == 308:
if self.fpf.href.endswith("feedburner.com/atom.xml"):
return FEED_ERRHTTP, ret_values
redirects, non_redirects = self.feed.count_redirects_in_history("feed")
self.feed.save_feed_history(
self.fpf.status, "HTTP Redirect (%d to go)" % (10 - len(redirects))
)
if len(redirects) >= 10 or len(non_redirects) == 0:
address = self.fpf.href
if self.options["force"] and address:
address = qurl(address, remove=["_"])
self.feed.feed_address = strip_underscore_from_feed_address(address)
if not self.feed.known_good:
self.feed.fetched_once = True
logging.debug(
" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
% (self.feed.log_title[:30], self.fpf.status)
)
self.feed = self.feed.schedule_feed_fetch_immediately()
if not self.fpf.entries:
self.feed = self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
return FEED_ERRHTTP, ret_values
if self.fpf.status >= 400:
logging.debug(
" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
% (self.feed.log_title[:30], self.fpf.status)
)
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRHTTP, ret_values
if not self.fpf:
logging.debug(
" ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!"
% (self.feed.log_title[:30])
)
self.feed.save_feed_history(551, "Broken feed")
return FEED_ERRHTTP, ret_values
if self.fpf and not self.fpf.entries:
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
logging.debug(
" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
% (self.feed.log_title[:30], len(self.fpf.entries))
)
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(552, "Non-xml feed", self.fpf.bozo_exception)
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
logging.debug(
" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
% (self.feed.log_title[:30], len(self.fpf.entries))
)
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(553, "Not an RSS feed", self.fpf.bozo_exception)
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
return None, None
def compare_feed_attribute_changes(self):
"""
        The feed has changed (or it is the first time we parse it), so save the etag
        and last_modified fields, along with any changed title, tagline, and link.
"""
if not self.feed:
logging.debug(f"Missing feed: {self.feed}")
return
original_etag = self.feed.etag
self.feed.etag = self.fpf.get("etag")
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
        # sometimes this is None (it never should be) *sigh*
if self.feed.etag is None:
self.feed.etag = ""
if self.feed.etag != original_etag:
self.feed.save(update_fields=["etag"])
original_last_modified = self.feed.last_modified
if hasattr(self.fpf, "modified") and self.fpf.modified:
try:
self.feed.last_modified = datetime.datetime.strptime(
self.fpf.modified, "%a, %d %b %Y %H:%M:%S %Z"
)
except Exception as e:
self.feed.last_modified = None
logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
pass
if self.feed.last_modified != original_last_modified:
self.feed.save(update_fields=["last_modified"])
original_title = self.feed.feed_title
if self.fpf.feed.get("title"):
self.feed.feed_title = strip_tags(self.fpf.feed.get("title"))
if self.feed.feed_title != original_title:
self.feed.save(update_fields=["feed_title"])
tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline)
if tagline:
original_tagline = self.feed.data.feed_tagline
self.feed.data.feed_tagline = smart_str(tagline)
if self.feed.data.feed_tagline != original_tagline:
self.feed.data.save(update_fields=["feed_tagline"])
if not self.feed.feed_link_locked:
new_feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link
if self.options["force"] and new_feed_link:
new_feed_link = qurl(new_feed_link, remove=["_"])
if new_feed_link != self.feed.feed_link:
logging.debug(
" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s"
% (self.feed.log_title[:30], self.feed.feed_link, new_feed_link)
)
redirects, non_redirects = self.feed.count_redirects_in_history("page")
self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10 - len(redirects)))
if len(redirects) >= 10 or len(non_redirects) == 0:
self.feed.feed_link = new_feed_link
self.feed.save(update_fields=["feed_link"])
def check_feed_for_push(self):
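        """Looks for a PubSubHubbub hub in the feed's links and (re-)subscribes when
        the feed has active subscribers and the lease is missing, expired, or forced;
        turns push off when the hub disappears or no subscribers remain."""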
if not (hasattr(self.fpf, "feed") and hasattr(self.fpf.feed, "links") and self.fpf.feed.links):
return
hub_url = None
self_url = self.feed.feed_address
for link in self.fpf.feed.links:
if link["rel"] == "hub" and not hub_url:
hub_url = link["href"]
elif link["rel"] == "self":
self_url = link["href"]
if not hub_url and "youtube.com" in self_url:
hub_url = "https://pubsubhubbub.appspot.com/subscribe"
channel_id = self_url.split("channel_id=")
if len(channel_id) > 1:
self_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id[1]}"
push_expired = False
if self.feed.is_push:
try:
push_expired = self.feed.push.lease_expires < datetime.datetime.now()
except PushSubscription.DoesNotExist:
self.feed.is_push = False
if (
hub_url
and self_url
and not settings.DEBUG
and self.feed.active_subscribers > 0
and (push_expired or not self.feed.is_push or self.options.get("force"))
):
logging.debug(
" ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s"
% (self.feed.log_title[:30], "~SKRe-~SN" if push_expired else "", hub_url)
)
try:
if settings.ENABLE_PUSH:
PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
except TimeoutError:
logging.debug(
" ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s"
% (self.feed.log_title[:30], hub_url)
)
elif self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url):
logging.debug(" ---> [%-30s] ~BB~FWTurning off PuSH, no hub found" % (self.feed.log_title[:30]))
self.feed.is_push = False
self.feed = self.feed.save()
class FeedFetcherWorker:
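    """Works through a queue of feed ids: fetches and processes each feed, then
    optionally fetches the original page and favicon, publishes new-story counts,
    and records per-feed timing analytics."""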
def __init__(self, options):
self.options = options
self.feed_stats = {
FEED_OK: 0,
FEED_SAME: 0,
FEED_ERRPARSE: 0,
FEED_ERRHTTP: 0,
FEED_ERREXC: 0,
}
self.feed_trans = {
FEED_OK: "ok",
FEED_SAME: "unchanged",
FEED_ERRPARSE: "cant_parse",
FEED_ERRHTTP: "http_error",
FEED_ERREXC: "exception",
}
self.feed_keys = sorted(self.feed_trans.keys())
self.time_start = datetime.datetime.utcnow()
def refresh_feed(self, feed_id):
"""Update feed, since it may have changed"""
return Feed.get_by_id(feed_id)
def reset_database_connections(self):
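        """Drops inherited mongoengine connections and reconnects, presumably because
        MongoDB connections can't be shared across forked worker processes."""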
connection._connections = {}
connection._connection_settings = {}
connection._dbs = {}
settings.MONGODB = connect(settings.MONGO_DB_NAME, **settings.MONGO_DB)
if "username" in settings.MONGO_ANALYTICS_DB:
settings.MONGOANALYTICSDB = connect(
db=settings.MONGO_ANALYTICS_DB["name"],
host=f"mongodb://{settings.MONGO_ANALYTICS_DB['username']}:{settings.MONGO_ANALYTICS_DB['password']}@{settings.MONGO_ANALYTICS_DB['host']}/?authSource=admin",
alias="nbanalytics",
)
else:
settings.MONGOANALYTICSDB = connect(
db=settings.MONGO_ANALYTICS_DB["name"],
host=f"mongodb://{settings.MONGO_ANALYTICS_DB['host']}/",
alias="nbanalytics",
)
def process_feed_wrapper(self, feed_queue):
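        """Main worker loop: fetches and processes every feed in feed_queue (or its
        archive pages when archive_page is set), then fetches pages and icons and
        records analytics. Returns the feed when the queue holds a single feed id."""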
self.reset_database_connections()
delta = None
current_process = multiprocessing.current_process()
identity = "X"
feed = None
if current_process._identity:
identity = current_process._identity[0]
# If fetching archive pages, come back once the archive scaffolding is built
if self.options.get("archive_page", None):
for feed_id in feed_queue:
feed = self.refresh_feed(feed_id)
try:
self.fetch_and_process_archive_pages(feed_id)
except SoftTimeLimitExceeded:
logging.debug(
" ---> [%-30s] ~FRTime limit reached while fetching ~FGarchive pages~FR. Made it to ~SB%s"
% (feed.log_title[:30], self.options["archive_page"])
)
pass
if len(feed_queue) == 1:
feed = self.refresh_feed(feed_queue[0])
return feed
return
for feed_id in feed_queue:
start_duration = time.time()
feed_fetch_duration = None
feed_process_duration = None
page_duration = None
icon_duration = None
feed_code = None
ret_entries = None
start_time = time.time()
ret_feed = FEED_ERREXC
set_user({"id": feed_id})
try:
feed = self.refresh_feed(feed_id)
set_user({"id": feed_id, "username": feed.feed_title})
skip = False
if self.options.get("fake"):
skip = True
weight = "-"
quick = "-"
rand = "-"
elif (
self.options.get("quick")
and not self.options["force"]
2022-03-02 10:55:46 -05:00
and feed.known_good
and feed.fetched_once
and not feed.is_push
):
weight = feed.stories_last_month * feed.num_subscribers
random_weight = random.randint(1, max(weight, 1))
quick = float(self.options.get("quick", 0))
rand = random.random()
if random_weight < 1000 and rand < quick:
skip = True
elif False and feed.feed_address.startswith("http://news.google.com/news"):
skip = True
weight = "-"
quick = "-"
rand = "-"
if skip:
logging.debug(
" ---> [%-30s] ~BGFaking fetch, skipping (%s/month, %s subs, %s < %s)..."
% (feed.log_title[:30], weight, feed.num_subscribers, rand, quick)
)
continue
ffeed = FetchFeed(feed_id, self.options)
ret_feed, fetched_feed = ffeed.fetch()
feed_fetch_duration = time.time() - start_duration
raw_feed = ffeed.raw_feed
if fetched_feed and (ret_feed == FEED_OK or self.options["force"]):
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
ret_feed, ret_entries = pfeed.process()
feed = pfeed.feed
feed_process_duration = time.time() - start_duration
if (ret_entries and ret_entries["new"]) or self.options["force"]:
start = time.time()
if not feed.known_good or not feed.fetched_once:
feed.known_good = True
feed.fetched_once = True
feed = feed.save()
if self.options["force"] or random.random() <= 0.02:
logging.debug(
" ---> [%-30s] ~FBPerforming feed cleanup..." % (feed.log_title[:30],)
)
start_cleanup = time.time()
feed.count_fs_size_bytes()
logging.debug(
" ---> [%-30s] ~FBDone with feed cleanup. Took ~SB%.4s~SN sec."
% (feed.log_title[:30], time.time() - start_cleanup)
)
try:
self.count_unreads_for_subscribers(feed)
except TimeoutError:
logging.debug(
" ---> [%-30s] Unread count took too long..." % (feed.log_title[:30],)
)
if self.options["verbose"]:
logging.debug(
" ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss"
% (feed.log_title[:30], time.time() - start)
)
except (urllib.error.HTTPError, urllib.error.URLError) as e:
logging.debug(
" ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s" % (str(feed_id)[:30], e.reason)
)
feed_code = 404
feed.save_feed_history(feed_code, str(e.reason), e)
fetched_feed = None
except Feed.DoesNotExist:
logging.debug(" ---> [%-30s] ~FRFeed is now gone..." % (str(feed_id)[:30]))
continue
except SoftTimeLimitExceeded as e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
ret_feed = FEED_ERREXC
fetched_feed = None
feed_code = 559
feed.save_feed_history(feed_code, "Timeout", e)
except TimeoutError as e:
logging.debug(" ---> [%-30s] ~FRFeed fetch timed out..." % (feed.log_title[:30]))
feed_code = 505
feed.save_feed_history(feed_code, "Timeout", e)
fetched_feed = None
except Exception as e:
logging.debug("[%d] ! -------------------------" % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug("[%d] ! -------------------------" % (feed_id,))
ret_feed = FEED_ERREXC
feed = Feed.get_by_id(getattr(feed, "pk", feed_id))
if not feed:
continue
feed.save_feed_history(500, "Error", tb)
feed_code = 500
fetched_feed = None
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if not settings.DEBUG and hasattr(settings, "SENTRY_DSN") and settings.SENTRY_DSN:
capture_exception(e)
flush()
if not feed_code:
if ret_feed == FEED_OK:
feed_code = 200
elif ret_feed == FEED_SAME:
feed_code = 304
elif ret_feed == FEED_ERRHTTP:
feed_code = 400
if ret_feed == FEED_ERREXC:
feed_code = 500
elif ret_feed == FEED_ERRPARSE:
feed_code = 550
if not feed:
continue
feed = self.refresh_feed(feed.pk)
if not feed:
continue
if (
(self.options["force"])
or (random.random() > 0.9)
or (
fetched_feed
and feed.feed_link
and feed.has_page
and (ret_feed == FEED_OK or (ret_feed == FEED_SAME and feed.stories_last_month > 10))
)
):
logging.debug(" ---> [%-30s] ~FYFetching page: %s" % (feed.log_title[:30], feed.feed_link))
page_importer = PageImporter(feed)
try:
page_data = page_importer.fetch_page()
page_duration = time.time() - start_duration
except SoftTimeLimitExceeded as e:
logging.debug(
" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed
)
page_data = None
feed.save_feed_history(557, "Timeout", e)
except TimeoutError:
logging.debug(" ---> [%-30s] ~FRPage fetch timed out..." % (feed.log_title[:30]))
page_data = None
feed.save_page_history(555, "Timeout", "")
except Exception as e:
logging.debug("[%d] ! -------------------------" % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug("[%d] ! -------------------------" % (feed_id,))
feed.save_page_history(550, "Page Error", tb)
fetched_feed = None
page_data = None
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if not settings.DEBUG and hasattr(settings, "SENTRY_DSN") and settings.SENTRY_DSN:
capture_exception(e)
flush()
feed = self.refresh_feed(feed.pk)
logging.debug(" ---> [%-30s] ~FYFetching icon: %s" % (feed.log_title[:30], feed.feed_link))
force = self.options["force"]
if random.random() > 0.99:
force = True
icon_importer = IconImporter(feed, page_data=page_data, force=force)
try:
icon_importer.save()
icon_duration = time.time() - start_duration
except SoftTimeLimitExceeded as e:
logging.debug(
" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed
)
feed.save_feed_history(558, "Timeout", e)
except TimeoutError:
logging.debug(" ---> [%-30s] ~FRIcon fetch timed out..." % (feed.log_title[:30]))
feed.save_page_history(556, "Timeout", "")
except Exception as e:
logging.debug("[%d] ! -------------------------" % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug("[%d] ! -------------------------" % (feed_id,))
# feed.save_feed_history(560, "Icon Error", tb)
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if not settings.DEBUG and hasattr(settings, "SENTRY_DSN") and settings.SENTRY_DSN:
capture_exception(e)
flush()
else:
logging.debug(
" ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s"
% (
feed.log_title[:30],
self.feed_trans[ret_feed],
feed.stories_last_month,
"" if feed.has_page else " [HAS NO PAGE]",
)
)
feed = self.refresh_feed(feed.pk)
delta = time.time() - start_time
feed.last_load_time = round(delta)
feed.fetched_once = True
try:
feed = feed.save(update_fields=["last_load_time", "fetched_once"])
except IntegrityError:
logging.debug(
" ***> [%-30s] ~FRIntegrityError on feed: %s"
% (
feed.log_title[:30],
feed.feed_address,
)
)
if ret_entries and ret_entries["new"]:
self.publish_to_subscribers(feed, ret_entries["new"])
done_msg = "%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]" % (
identity,
feed.log_title[:30],
delta,
feed.pk,
self.feed_trans[ret_feed],
)
logging.debug(done_msg)
total_duration = time.time() - start_duration
MAnalyticsFetcher.add(
feed_id=feed.pk,
feed_fetch=feed_fetch_duration,
feed_process=feed_process_duration,
page=page_duration,
icon=icon_duration,
total=total_duration,
feed_code=feed_code,
)
self.feed_stats[ret_feed] += 1
if len(feed_queue) == 1:
return feed
# time_taken = datetime.datetime.utcnow() - self.time_start
def fetch_and_process_archive_pages(self, feed_id):
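        """Walks backwards through a feed's archive, trying ?page=/?paged= style
        pagination and RFC 5005 prev-archive links, processing each page until no
        new story hashes turn up."""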
feed = Feed.get_by_id(feed_id)
first_seen_feed = None
original_starting_page = self.options["archive_page"]
for archive_page_key in ["page", "paged", "rfc5005"]:
seen_story_hashes = set()
failed_pages = 0
self.options["archive_page_key"] = archive_page_key
if archive_page_key == "rfc5005":
self.options["archive_page"] = "rfc5005"
link_prev_archive = None
if first_seen_feed:
for link in getattr(first_seen_feed.feed, "links", []):
if link["rel"] == "prev-archive" or link["rel"] == "next":
link_prev_archive = link["href"]
logging.debug(
" ---> [%-30s] ~FGFeed has ~SBRFC5005~SN links, filling out archive: %s"
% (feed.log_title[:30], link_prev_archive)
)
break
else:
logging.debug(
" ---> [%-30s] ~FBFeed has no RFC5005 links..." % (feed.log_title[:30])
)
else:
self.options["archive_page_link"] = link_prev_archive
ffeed = FetchFeed(feed_id, self.options)
try:
ret_feed, fetched_feed = ffeed.fetch()
except TimeoutError:
logging.debug(
" ---> [%-30s] ~FRArchive feed fetch timed out..." % (feed.log_title[:30])
)
# Timeout means don't bother to keep checking...
continue
raw_feed = ffeed.raw_feed
if fetched_feed and ret_feed == FEED_OK:
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
if not pfeed.fpf or not pfeed.fpf.entries:
continue
for link in getattr(pfeed.fpf.feed, "links", []):
if link["rel"] == "prev-archive" or link["rel"] == "next":
link_prev_archive = link["href"]
if not link_prev_archive:
continue
while True:
if not link_prev_archive:
break
if link_prev_archive == self.options.get("archive_page_link", None):
logging.debug(
" ---> [%-30s] ~FRNo change in archive page link: %s"
% (feed.log_title[:30], link_prev_archive)
)
break
self.options["archive_page_link"] = link_prev_archive
link_prev_archive = None
ffeed = FetchFeed(feed_id, self.options)
try:
ret_feed, fetched_feed = ffeed.fetch()
except TimeoutError as e:
logging.debug(
" ---> [%-30s] ~FRArchive feed fetch timed out..." % (feed.log_title[:30])
)
# Timeout means don't bother to keep checking...
break
raw_feed = ffeed.raw_feed
if fetched_feed and ret_feed == FEED_OK:
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
if not pfeed.fpf or not pfeed.fpf.entries:
logging.debug(
" ---> [%-30s] ~FRFeed parse failed, no entries" % (feed.log_title[:30])
)
continue
for link in getattr(pfeed.fpf.feed, "links", []):
if link["rel"] == "prev-archive" or link["rel"] == "next":
link_prev_archive = link["href"]
logging.debug(
" ---> [%-30s] ~FGFeed still has ~SBRFC5005~SN links, continuing filling out archive: %s"
% (feed.log_title[:30], link_prev_archive)
)
break
else:
logging.debug(
" ---> [%-30s] ~FBFeed has no more RFC5005 links..." % (feed.log_title[:30])
)
break
before_story_hashes = len(seen_story_hashes)
pfeed.process()
seen_story_hashes.update(pfeed.archive_seen_story_hashes)
after_story_hashes = len(seen_story_hashes)
if before_story_hashes == after_story_hashes:
logging.debug(
" ---> [%-30s] ~FRNo change in story hashes, but has archive link: %s"
% (feed.log_title[:30], link_prev_archive)
)
failed_color = "~FR" if not link_prev_archive else ""
logging.debug(
f" ---> [{feed.log_title[:30]:<30}] ~FGStory hashes found, archive RFC5005 ~SB{link_prev_archive}~SN: ~SB~FG{failed_color}{len(seen_story_hashes):,} stories~SN~FB"
)
else:
for page in range(3 if settings.DEBUG and False else 150):
if page < original_starting_page:
continue
if failed_pages >= 1:
break
self.options["archive_page"] = page + 1
ffeed = FetchFeed(feed_id, self.options)
try:
ret_feed, fetched_feed = ffeed.fetch()
except TimeoutError as e:
logging.debug(
" ---> [%-30s] ~FRArchive feed fetch timed out..." % (feed.log_title[:30])
)
# Timeout means don't bother to keep checking...
break
raw_feed = ffeed.raw_feed
if fetched_feed and ret_feed == FEED_OK:
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
if not pfeed.fpf or not pfeed.fpf.entries:
failed_pages += 1
continue
if not first_seen_feed:
first_seen_feed = pfeed.fpf
before_story_hashes = len(seen_story_hashes)
pfeed.process()
seen_story_hashes.update(pfeed.archive_seen_story_hashes)
after_story_hashes = len(seen_story_hashes)
if before_story_hashes == after_story_hashes:
failed_pages += 1
else:
failed_pages += 1
failed_color = "~FR" if failed_pages > 0 else ""
logging.debug(
f" ---> [{feed.log_title[:30]:<30}] ~FGStory hashes found, archive page ~SB{page+1}~SN: ~SB~FG{len(seen_story_hashes):,} stories~SN~FB, {failed_color}{failed_pages} failures"
)
def publish_to_subscribers(self, feed, new_count):
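        """Publishes the new-story count over Redis pub/sub so connected clients can
        update in real time."""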
try:
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
listeners_count = r.publish(str(feed.pk), "story:new_count:%s" % new_count)
if listeners_count:
logging.debug(
" ---> [%-30s] ~FMPublished to %s subscribers" % (feed.log_title[:30], listeners_count)
)
except redis.ConnectionError:
logging.debug(" ***> [%-30s] ~BMRedis is unavailable for real-time." % (feed.log_title[:30],))
def count_unreads_for_subscribers(self, feed):
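        """Flags active subscribers for unread recalculation and, when the
        compute_scores option is set, recalculates their feed scores against the
        feed's stories inside the unread cutoff."""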
subscriber_expire = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
user_subs = UserSubscription.objects.filter(
feed=feed, active=True, user__profile__last_seen_on__gte=subscriber_expire
).order_by("-last_read_date")
if not user_subs.count():
return
for sub in user_subs:
if not sub.needs_unread_recalc:
sub.needs_unread_recalc = True
sub.save()
if self.options["compute_scores"]:
r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
stories = MStory.objects(story_feed_id=feed.pk, story_date__gte=feed.unread_cutoff)
stories = Feed.format_stories(stories, feed.pk)
story_hashes = r.zrangebyscore(
"zF:%s" % feed.pk,
int(feed.unread_cutoff.strftime("%s")),
int(time.time() + 60 * 60 * 24),
)
missing_story_hashes = set(story_hashes) - set([s["story_hash"] for s in stories])
if missing_story_hashes:
missing_stories = MStory.objects(
story_feed_id=feed.pk, story_hash__in=missing_story_hashes
).read_preference(pymongo.ReadPreference.PRIMARY)
missing_stories = Feed.format_stories(missing_stories, feed.pk)
stories = missing_stories + stories
logging.debug(
" ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores"
% (
feed.log_title[:30],
len(missing_stories),
len(missing_story_hashes),
len(stories),
)
)
cache.set("S:v3:%s" % feed.pk, stories, 60)
logging.debug(
" ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)"
% (
feed.log_title[:30],
len(stories),
user_subs.count(),
feed.num_subscribers,
feed.active_subscribers,
feed.premium_subscribers,
)
)
self.calculate_feed_scores_with_stories(user_subs, stories)
elif self.options.get("mongodb_replication_lag"):
logging.debug(
" ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag"
% (feed.log_title[:30], self.options.get("mongodb_replication_lag"))
)
@timelimit(10)
def calculate_feed_scores_with_stories(self, user_subs, stories):
for sub in user_subs:
2024-04-24 09:43:56 -04:00
            silent = self.options.get("verbose", 0) < 2  # options is a dict, so getattr() never found "verbose"
sub.calculate_feed_scores(silent=silent, stories=stories)
class Dispatcher:
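    """Fans feed queues out to FeedFetcherWorker processes, either in-process when
    single-threaded or via multiprocessing."""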
def __init__(self, options, num_threads):
self.options = options
self.num_threads = num_threads
self.workers = []
def add_jobs(self, feeds_queue, feeds_count=1):
"""adds a feed processing job to the pool"""
self.feeds_queue = feeds_queue
self.feeds_count = feeds_count
def run_jobs(self):
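        """Runs the queued feeds: inline when single-threaded or num_threads == 1,
        otherwise one multiprocessing.Process per queue."""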
if self.options["single_threaded"] or self.num_threads == 1:
return dispatch_workers(self.feeds_queue[0], self.options)
else:
for i in range(self.num_threads):
feed_queue = self.feeds_queue[i]
self.workers.append(
multiprocessing.Process(target=dispatch_workers, args=(feed_queue, self.options))
)
for i in range(self.num_threads):
self.workers[i].start()
def dispatch_workers(feed_queue, options):
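    """Entry point for each worker: builds a FeedFetcherWorker and processes its
    feed queue."""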
worker = FeedFetcherWorker(options)
return worker.process_feed_wrapper(feed_queue)
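
# Illustrative sketch, not taken from this file: roughly how the fetcher is driven.
# The option names mirror the keys this module reads (verbose, force, quick, etc.),
# but the real caller lives elsewhere in the codebase, so treat the exact invocation
# below as an assumption.
#
#     options = {
#         "verbose": 0,
#         "force": False,
#         "quick": 0,
#         "updates_off": False,
#         "compute_scores": True,
#         "single_threaded": True,
#     }
#     dispatcher = Dispatcher(options, num_threads=1)
#     dispatcher.add_jobs([[feed_id]])
#     dispatcher.run_jobs()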