# NewsBlur-viq/utils/feed_fetcher.py


import datetime
import multiprocessing
import time
import traceback
import django
django.setup()
import http
import http.client
import urllib.error
import urllib.parse
import urllib.request
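# Raise http.client's header cap (default is 100) so feeds that send an unusual number of headers still parse.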
http.client._MAXHEADERS = 10000
import random
import re
import xml.sax
import feedparser
import pymongo
import redis
import requests
from django.conf import settings
from django.core.cache import cache
from django.db import IntegrityError
from sentry_sdk import set_user
from apps.notifications.models import MUserFeedNotification
from apps.notifications.tasks import QueueNotifications
from apps.push.models import PushSubscription
from apps.reader.models import UserSubscription
from apps.rss_feeds.icon_importer import IconImporter
from apps.rss_feeds.models import Feed, MStory
from apps.rss_feeds.page_importer import PageImporter
from apps.statistics.models import MAnalyticsFetcher, MStatistics
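# Let a couple of extra elements through feedparser's HTML sanitizer.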
feedparser.sanitizer._HTMLSanitizer.acceptable_elements.update(['iframe'])
feedparser.sanitizer._HTMLSanitizer.acceptable_elements.update(['text'])
from bs4 import BeautifulSoup
from celery.exceptions import SoftTimeLimitExceeded
from django.utils import feedgenerator
from django.utils.encoding import smart_str
from django.utils.html import linebreaks
from mongoengine import connect, connection
from qurl import qurl
from sentry_sdk import capture_exception, flush
from utils import json_functions as json
from utils import log as logging
from utils.facebook_fetcher import FacebookFetcher
from utils.feed_functions import TimeoutError, timelimit
from utils.json_fetcher import JSONFetcher
from utils.story_functions import linkify, pre_process_story, strip_tags
from utils.twitter_fetcher import TwitterFetcher
from utils.youtube_fetcher import YoutubeFetcher
# from utils.feed_functions import mail_feed_error_to_admin
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
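# Fetch/process status codes; FeedFetcherWorker maps these to human-readable labels and HTTP-style feed_codes.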
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
class FetchFeed:
def __init__(self, feed_id, options):
self.feed = Feed.get_by_id(feed_id)
self.options = options
self.fpf = None
self.raw_feed = None
@timelimit(45)
def fetch(self):
"""
        Uses requests to download the feed, parsing it with feedparser. Stories are extracted and saved later in ProcessFeed.
"""
start = time.time()
identity = self.get_identity()
if self.options.get('archive_page', None):
log_msg = '%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY) ~BG~FMarchive page~ST~FY: ~SB%s' % (
identity,
self.feed.log_title[:30],
self.feed.id,
self.options['archive_page'],
)
else:
log_msg = '%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
identity,
self.feed.log_title[:30],
self.feed.id,
datetime.datetime.now() - self.feed.last_update,
)
logging.debug(log_msg)
etag = self.feed.etag
modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
address = self.feed.feed_address
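        # Bypass conditional GET (clear ETag / Last-Modified) when forced, when paging through an archive,
        # or randomly on ~1% of fetches; forced fetches also get a cache-busting "_" query parameter.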
if self.options.get('force') or self.options.get('archive_page', None) or random.random() <= 0.01:
self.options['force'] = True
modified = None
etag = None
if self.options.get('archive_page', None) == "rfc5005" and self.options.get('archive_page_link', None):
address = self.options['archive_page_link']
elif self.options.get('archive_page', None):
address = qurl(address, add={self.options['archive_page_key']: self.options['archive_page']})
elif address.startswith('http'):
address = qurl(address, add={"_": random.randint(0, 10000)})
logging.debug(' ---> [%-30s] ~FBForcing fetch: %s' % (self.feed.log_title[:30], address))
elif not self.feed.fetched_once or not self.feed.known_good:
modified = None
etag = None
if self.options.get('feed_xml'):
logging.debug(
' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
% (self.feed.log_title[:30], len(self.options.get('feed_xml')))
)
if self.options.get('fpf'):
self.fpf = self.options.get('fpf')
logging.debug(
' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (self.feed.log_title[:30])
)
return FEED_OK, self.fpf
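        # YouTube, Twitter, and Facebook addresses are handled by dedicated fetchers that
        # build feed content feedparser can parse.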
if 'youtube.com' in address:
youtube_feed = self.fetch_youtube()
if not youtube_feed:
logging.debug(
' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(youtube_feed, sanitize_html=False)
elif re.match(r'(https?)?://twitter.com/\w+/?', qurl(address, remove=['_'])):
twitter_feed = self.fetch_twitter(address)
if not twitter_feed:
logging.debug(
' ***> [%-30s] ~FRTwitter fetch failed: %s' % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(twitter_feed)
elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
facebook_feed = self.fetch_facebook()
if not facebook_feed:
logging.debug(
' ***> [%-30s] ~FRFacebook fetch failed: %s' % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(facebook_feed)
if not self.fpf and 'json' in address:
try:
headers = self.feed.fetch_headers()
if etag:
headers['If-None-Match'] = etag
if modified:
# format into an RFC 1123-compliant timestamp. We can't use
# time.strftime() since the %a and %b directives can be affected
# by the current locale, but RFC 2616 states that dates must be
# in English.
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
months = [
'Jan',
'Feb',
'Mar',
'Apr',
'May',
'Jun',
'Jul',
'Aug',
'Sep',
'Oct',
'Nov',
'Dec',
]
modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
short_weekdays[modified[6]],
modified[2],
months[modified[1] - 1],
modified[0],
modified[3],
modified[4],
modified[5],
)
headers['If-Modified-Since'] = modified_header
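                # RFC 3229 delta encoding: "A-IM: feed" asks supporting servers to return only
                # the entries that changed since the last fetch.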
if etag or modified:
headers['A-IM'] = 'feed'
try:
raw_feed = requests.get(address, headers=headers, timeout=15)
except (requests.adapters.ConnectionError, TimeoutError):
raw_feed = None
if not raw_feed or raw_feed.status_code >= 400:
if raw_feed:
logging.debug(
" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s"
% (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers)
)
else:
logging.debug(" ***> [%-30s] ~FRJson feed fetch timed out, trying fake headers: %s" % (self.feed.log_title[:30], address))
raw_feed = requests.get(
self.feed.feed_address,
headers=self.feed.fetch_headers(fake=True),
timeout=15,
)
json_feed_content_type = any(
json_feed in raw_feed.headers.get('Content-Type', "")
for json_feed in ['application/feed+json', 'application/json']
)
if raw_feed.content and json_feed_content_type:
# JSON Feed
json_feed = self.fetch_json_feed(address, raw_feed)
if not json_feed:
logging.debug(
' ***> [%-30s] ~FRJSON fetch failed: %s' % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(json_feed)
elif raw_feed.content and raw_feed.status_code < 400:
response_headers = raw_feed.headers
response_headers['Content-Location'] = raw_feed.url
self.raw_feed = smart_str(raw_feed.content)
self.fpf = feedparser.parse(self.raw_feed, response_headers=response_headers)
if self.options['verbose']:
logging.debug(
" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
% (
self.feed.log_title[:30],
raw_feed.status_code,
len(smart_str(raw_feed.content)),
raw_feed.headers,
)
)
except Exception as e:
logging.debug(
" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s"
% (self.feed.log_title[:30], str(e))
)
# raise e
if not self.fpf or self.options.get('force_fp', False):
try:
self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified)
except (
TypeError,
ValueError,
KeyError,
EOFError,
MemoryError,
urllib.error.URLError,
http.client.InvalidURL,
http.client.BadStatusLine,
http.client.IncompleteRead,
ConnectionResetError,
TimeoutError,
) as e:
logging.debug(' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.log_title[:30], e))
pass
if not self.fpf:
try:
logging.debug(' ***> [%-30s] ~FRTurning off headers: %s' % (self.feed.log_title[:30], address))
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
except (
TypeError,
ValueError,
KeyError,
EOFError,
MemoryError,
urllib.error.URLError,
http.client.InvalidURL,
http.client.BadStatusLine,
http.client.IncompleteRead,
ConnectionResetError,
) as e:
logging.debug(' ***> [%-30s] ~FRFetch failed: %s.' % (self.feed.log_title[:30], e))
return FEED_ERRHTTP, None
logging.debug(
' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (self.feed.log_title[:30], time.time() - start)
)
return FEED_OK, self.fpf
def get_identity(self):
identity = "X"
current_process = multiprocessing.current_process()
if current_process._identity:
identity = current_process._identity[0]
return identity
def fetch_twitter(self, address=None):
twitter_fetcher = TwitterFetcher(self.feed, self.options)
return twitter_fetcher.fetch(address)
def fetch_facebook(self):
facebook_fetcher = FacebookFetcher(self.feed, self.options)
return facebook_fetcher.fetch()
def fetch_json_feed(self, address, headers):
json_fetcher = JSONFetcher(self.feed, self.options)
return json_fetcher.fetch(address, headers)
def fetch_youtube(self):
youtube_fetcher = YoutubeFetcher(self.feed, self.options)
return youtube_fetcher.fetch()
class ProcessFeed:
def __init__(self, feed_id, fpf, options, raw_feed=None):
self.feed_id = feed_id
self.options = options
self.fpf = fpf
self.raw_feed = raw_feed
self.archive_seen_story_hashes = set()
def refresh_feed(self):
self.feed = Feed.get_by_id(self.feed_id)
if self.feed_id != self.feed.pk:
logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk))
self.feed_id = self.feed.pk
def process(self):
"""Downloads and parses a feed."""
start = time.time()
self.refresh_feed()
if not self.options.get('archive_page', None):
feed_status, ret_values = self.verify_feed_integrity()
if feed_status and ret_values:
return feed_status, ret_values
self.fpf.entries = self.fpf.entries[:100]
if not self.options.get('archive_page', None):
self.compare_feed_attribute_changes()
        # Determine whether guids and permalinks are unusable (duplicated across entries) so broken guids can be replaced below
guids_seen = set()
permalinks_seen = set()
for entry in self.fpf.entries:
guids_seen.add(entry.get('guid'))
permalinks_seen.add(Feed.get_permalink(entry))
guid_difference = len(guids_seen) != len(self.fpf.entries)
single_guid = len(guids_seen) == 1
replace_guids = single_guid and guid_difference
permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
single_permalink = len(permalinks_seen) == 1
replace_permalinks = single_permalink and permalink_difference
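        # If every entry shares a single guid, permalinks are used as guids instead; if permalinks
        # are also all identical, the published timestamp is used (applied in the loop below).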
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
story_hashes = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry, self.fpf.encoding)
if not story['title'] and not story['story_content']:
continue
if self.options.get('archive_page', None) and story.get('published') > day_ago:
                # Archive only: arbitrary but necessary to keep feeds from creating an unlimited number of stories.
                # Stories without a guid get one auto-generated from their date, and stories missing a date fall
                # back to the latest date, so reject anything newer than 24 hours old when filling out the archive.
# logging.debug(f" ---> [%-30s] ~FBTossing story because it's too new for the archive: ~SB{story}")
continue
if story.get('published') < start_date:
start_date = story.get('published')
if replace_guids:
if replace_permalinks:
new_story_guid = str(story.get('published'))
if self.options['verbose']:
logging.debug(
' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s'
% (self.feed.log_title[:30], story.get('guid'), new_story_guid)
)
story['guid'] = new_story_guid
else:
new_story_guid = Feed.get_permalink(story)
if self.options['verbose']:
logging.debug(
' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s'
% (self.feed.log_title[:30], story.get('guid'), new_story_guid)
)
story['guid'] = new_story_guid
story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
stories.append(story)
story_hashes.append(story.get('story_hash'))
original_story_hash_count = len(story_hashes)
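        # Mix in hashes of stories already stored within the unread cutoff so the existing-story lookup
        # (and add_update_stories) can match incoming entries against recently seen stories instead of recreating them.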
story_hashes_in_unread_cutoff = self.feed.story_hashes_in_unread_cutoff[:original_story_hash_count]
story_hashes.extend(story_hashes_in_unread_cutoff)
story_hashes = list(set(story_hashes))
if self.options['verbose'] or settings.DEBUG:
logging.debug(
' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db'
% (
self.feed.log_title[:30],
original_story_hash_count,
len(story_hashes) - original_story_hash_count,
len(story_hashes_in_unread_cutoff),
)
)
existing_stories = dict(
(s.story_hash, s)
for s in MStory.objects(
story_hash__in=story_hashes,
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
)
)
# if len(existing_stories) == 0:
# existing_stories = dict((s.story_hash, s) for s in MStory.objects(
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
# ))
ret_values = self.feed.add_update_stories(
stories,
existing_stories,
verbose=self.options['verbose'],
updates_off=self.options['updates_off'],
)
# PubSubHubbub
if not self.options.get('archive_page', None):
self.check_feed_for_push()
# Push notifications
if ret_values['new'] > 0 and MUserFeedNotification.feed_has_users(self.feed.pk) > 0:
QueueNotifications.delay(self.feed.pk, ret_values['new'])
# All Done
logging.debug(
' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s'
% (
self.feed.log_title[:30],
'~FG~SB' if ret_values['new'] else '',
ret_values['new'],
'~FY~SB' if ret_values['updated'] else '',
ret_values['updated'],
'~SB' if ret_values['same'] else '',
ret_values['same'],
'~FR~SB' if ret_values['error'] else '',
ret_values['error'],
len(self.fpf.entries),
)
)
self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']), force=self.options['force'])
fetch_date = datetime.datetime.now()
if ret_values['new']:
if not getattr(settings, 'TEST_DEBUG', False):
self.feed.trim_feed()
self.feed.expire_redis()
if MStatistics.get('raw_feed', None) == self.feed.pk:
self.feed.save_raw_feed(self.raw_feed, fetch_date)
self.feed.save_feed_history(200, "OK", date=fetch_date)
if self.options['verbose']:
logging.debug(
' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss'
% (self.feed.log_title[:30], time.time() - start)
)
if self.options.get('archive_page', None):
self.archive_seen_story_hashes.update(story_hashes)
return FEED_OK, ret_values
def verify_feed_integrity(self):
"""Ensures stories come through and any abberant status codes get saved
Returns:
FEED_STATUS: enum
ret_values: dictionary of counts of new, updated, same, and error stories
"""
ret_values = dict(new=0, updated=0, same=0, error=0)
if not self.feed:
return FEED_ERREXC, ret_values
if hasattr(self.fpf, 'status'):
if self.options['verbose']:
if self.fpf.bozo and self.fpf.status != 304:
logging.debug(
' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)'
% (self.feed.log_title[:30], self.fpf.bozo_exception, len(self.fpf.entries))
)
if self.fpf.status == 304:
self.feed = self.feed.save()
self.feed.save_feed_history(304, "Not modified")
return FEED_SAME, ret_values
# 302 and 307: Temporary redirect: ignore
# 301 and 308: Permanent redirect: save it (after 10 tries)
if self.fpf.status == 301 or self.fpf.status == 308:
if self.fpf.href.endswith('feedburner.com/atom.xml'):
return FEED_ERRHTTP, ret_values
redirects, non_redirects = self.feed.count_redirects_in_history('feed')
self.feed.save_feed_history(
self.fpf.status, "HTTP Redirect (%d to go)" % (10 - len(redirects))
)
if len(redirects) >= 10 or len(non_redirects) == 0:
address = self.fpf.href
if self.options['force'] and address:
address = qurl(address, remove=['_'])
self.feed.feed_address = address
if not self.feed.known_good:
self.feed.fetched_once = True
logging.debug(
" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
% (self.feed.log_title[:30], self.fpf.status)
)
self.feed = self.feed.schedule_feed_fetch_immediately()
if not self.fpf.entries:
self.feed = self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
return FEED_ERRHTTP, ret_values
if self.fpf.status >= 400:
logging.debug(
" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
% (self.feed.log_title[:30], self.fpf.status)
)
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRHTTP, ret_values
if not self.fpf:
logging.debug(
" ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!"
% (self.feed.log_title[:30])
)
self.feed.save_feed_history(551, "Broken feed")
return FEED_ERRHTTP, ret_values
if self.fpf and not self.fpf.entries:
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
logging.debug(
" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
% (self.feed.log_title[:30], len(self.fpf.entries))
)
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
logging.debug(
" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
% (self.feed.log_title[:30], len(self.fpf.entries))
)
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(553, 'Not an RSS feed', self.fpf.bozo_exception)
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
return None, None
def compare_feed_attribute_changes(self):
"""
        The feed has changed (or this is the first time it is being parsed), so save the etag,
        last_modified, title, tagline, and feed link fields if they differ.
"""
if not self.feed:
logging.debug(f"Missing feed: {self.feed}")
return
original_etag = self.feed.etag
self.feed.etag = self.fpf.get('etag')
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
        # Sometimes this is None (it never should be) *sigh*
if self.feed.etag is None:
self.feed.etag = ''
if self.feed.etag != original_etag:
self.feed.save(update_fields=['etag'])
original_last_modified = self.feed.last_modified
if hasattr(self.fpf, 'modified') and self.fpf.modified:
try:
self.feed.last_modified = datetime.datetime.strptime(
self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z'
)
except Exception as e:
self.feed.last_modified = None
logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
pass
if self.feed.last_modified != original_last_modified:
self.feed.save(update_fields=['last_modified'])
original_title = self.feed.feed_title
if self.fpf.feed.get('title'):
self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
if self.feed.feed_title != original_title:
self.feed.save(update_fields=['feed_title'])
tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
if tagline:
original_tagline = self.feed.data.feed_tagline
self.feed.data.feed_tagline = smart_str(tagline)
if self.feed.data.feed_tagline != original_tagline:
self.feed.data.save(update_fields=['feed_tagline'])
if not self.feed.feed_link_locked:
new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
if self.options['force'] and new_feed_link:
new_feed_link = qurl(new_feed_link, remove=['_'])
if new_feed_link != self.feed.feed_link:
logging.debug(
" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s"
% (self.feed.log_title[:30], self.feed.feed_link, new_feed_link)
)
redirects, non_redirects = self.feed.count_redirects_in_history('page')
self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10 - len(redirects)))
if len(redirects) >= 10 or len(non_redirects) == 0:
self.feed.feed_link = new_feed_link
self.feed.save(update_fields=['feed_link'])
def check_feed_for_push(self):
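        # PubSubHubbub/WebSub: the feed advertises its hub (rel=hub) and canonical address (rel=self).
        # Subscribe when there are active subscribers, and re-subscribe once a lease expires.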
if not (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
return
hub_url = None
self_url = self.feed.feed_address
for link in self.fpf.feed.links:
if link['rel'] == 'hub' and not hub_url:
hub_url = link['href']
elif link['rel'] == 'self':
self_url = link['href']
push_expired = False
if self.feed.is_push:
try:
push_expired = self.feed.push.lease_expires < datetime.datetime.now()
except PushSubscription.DoesNotExist:
self.feed.is_push = False
if (
hub_url
and self_url
and not settings.DEBUG
and self.feed.active_subscribers > 0
and (push_expired or not self.feed.is_push or self.options.get('force'))
):
logging.debug(
' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s'
% (self.feed.log_title[:30], "~SKRe-~SN" if push_expired else "", hub_url)
)
try:
if settings.ENABLE_PUSH:
PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
except TimeoutError:
logging.debug(
' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s'
% (self.feed.log_title[:30], hub_url)
)
elif self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url):
logging.debug(
' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (self.feed.log_title[:30])
)
self.feed.is_push = False
self.feed = self.feed.save()
class FeedFetcherWorker:
def __init__(self, options):
self.options = options
self.feed_stats = {
FEED_OK: 0,
FEED_SAME: 0,
FEED_ERRPARSE: 0,
FEED_ERRHTTP: 0,
FEED_ERREXC: 0,
}
self.feed_trans = {
FEED_OK: 'ok',
FEED_SAME: 'unchanged',
FEED_ERRPARSE: 'cant_parse',
FEED_ERRHTTP: 'http_error',
FEED_ERREXC: 'exception',
}
self.feed_keys = sorted(self.feed_trans.keys())
self.time_start = datetime.datetime.utcnow()
def refresh_feed(self, feed_id):
"""Update feed, since it may have changed"""
return Feed.get_by_id(feed_id)
def reset_database_connections(self):
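        # Workers run in separate processes, so clear mongoengine's cached connections and
        # re-establish them rather than reusing handles inherited from the parent process.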
connection._connections = {}
connection._connection_settings = {}
connection._dbs = {}
settings.MONGODB = connect(settings.MONGO_DB_NAME, **settings.MONGO_DB)
if 'username' in settings.MONGO_ANALYTICS_DB:
settings.MONGOANALYTICSDB = connect(
db=settings.MONGO_ANALYTICS_DB['name'],
host=f"mongodb://{settings.MONGO_ANALYTICS_DB['username']}:{settings.MONGO_ANALYTICS_DB['password']}@{settings.MONGO_ANALYTICS_DB['host']}/?authSource=admin",
alias="nbanalytics",
)
else:
settings.MONGOANALYTICSDB = connect(
db=settings.MONGO_ANALYTICS_DB['name'],
host=f"mongodb://{settings.MONGO_ANALYTICS_DB['host']}/",
alias="nbanalytics",
)
def process_feed_wrapper(self, feed_queue):
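        # Main worker loop: fetch each feed in the queue, process its entries, then optionally
        # fetch the feed's page and favicon, recording per-stage timings for analytics.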
self.reset_database_connections()
delta = None
current_process = multiprocessing.current_process()
identity = "X"
feed = None
if current_process._identity:
identity = current_process._identity[0]
# If fetching archive pages, come back once the archive scaffolding is built
if self.options.get('archive_page', None):
for feed_id in feed_queue:
feed = self.refresh_feed(feed_id)
try:
self.fetch_and_process_archive_pages(feed_id)
except SoftTimeLimitExceeded:
logging.debug(
' ---> [%-30s] ~FRTime limit reached while fetching ~FGarchive pages~FR. Made it to ~SB%s'
% (feed.log_title[:30], self.options['archive_page'])
)
pass
if len(feed_queue) == 1:
feed = self.refresh_feed(feed_queue[0])
return feed
return
for feed_id in feed_queue:
start_duration = time.time()
feed_fetch_duration = None
feed_process_duration = None
page_duration = None
icon_duration = None
feed_code = None
ret_entries = None
start_time = time.time()
ret_feed = FEED_ERREXC
set_user({"id": feed_id})
try:
feed = self.refresh_feed(feed_id)
set_user({"id": feed_id, "username": feed.feed_title})
skip = False
if self.options.get('fake'):
skip = True
weight = "-"
quick = "-"
rand = "-"
elif (
self.options.get('quick')
and not self.options['force']
and feed.known_good
and feed.fetched_once
and not feed.is_push
):
weight = feed.stories_last_month * feed.num_subscribers
random_weight = random.randint(1, max(weight, 1))
quick = float(self.options.get('quick', 0))
rand = random.random()
if random_weight < 1000 and rand < quick:
skip = True
elif False and feed.feed_address.startswith("http://news.google.com/news"):
skip = True
weight = "-"
quick = "-"
rand = "-"
if skip:
logging.debug(
' ---> [%-30s] ~BGFaking fetch, skipping (%s/month, %s subs, %s < %s)...'
% (feed.log_title[:30], weight, feed.num_subscribers, rand, quick)
)
continue
ffeed = FetchFeed(feed_id, self.options)
ret_feed, fetched_feed = ffeed.fetch()
feed_fetch_duration = time.time() - start_duration
raw_feed = ffeed.raw_feed
if fetched_feed and (ret_feed == FEED_OK or self.options['force']):
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
ret_feed, ret_entries = pfeed.process()
feed = pfeed.feed
feed_process_duration = time.time() - start_duration
if (ret_entries and ret_entries['new']) or self.options['force']:
start = time.time()
if not feed.known_good or not feed.fetched_once:
feed.known_good = True
feed.fetched_once = True
feed = feed.save()
if self.options['force'] or random.random() <= 0.02:
logging.debug(
' ---> [%-30s] ~FBPerforming feed cleanup...' % (feed.log_title[:30],)
)
start_cleanup = time.time()
feed.count_fs_size_bytes()
logging.debug(
' ---> [%-30s] ~FBDone with feed cleanup. Took ~SB%.4s~SN sec.'
% (feed.log_title[:30], time.time() - start_cleanup)
)
try:
self.count_unreads_for_subscribers(feed)
except TimeoutError:
logging.debug(
' ---> [%-30s] Unread count took too long...' % (feed.log_title[:30],)
)
if self.options['verbose']:
logging.debug(
' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss'
% (feed.log_title[:30], time.time() - start)
)
except (urllib.error.HTTPError, urllib.error.URLError) as e:
logging.debug(
' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (str(feed_id)[:30], e.reason)
)
feed_code = 404
feed.save_feed_history(feed_code, str(e.reason), e)
fetched_feed = None
except Feed.DoesNotExist:
logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (str(feed_id)[:30]))
continue
except SoftTimeLimitExceeded as e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
ret_feed = FEED_ERREXC
fetched_feed = None
feed_code = 559
feed.save_feed_history(feed_code, 'Timeout', e)
except TimeoutError as e:
logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.log_title[:30]))
feed_code = 505
feed.save_feed_history(feed_code, 'Timeout', e)
fetched_feed = None
except Exception as e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
ret_feed = FEED_ERREXC
feed = Feed.get_by_id(getattr(feed, 'pk', feed_id))
if not feed:
continue
feed.save_feed_history(500, "Error", tb)
feed_code = 500
fetched_feed = None
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if not settings.DEBUG and hasattr(settings, 'SENTRY_DSN') and settings.SENTRY_DSN:
capture_exception(e)
flush()
if not feed_code:
if ret_feed == FEED_OK:
feed_code = 200
elif ret_feed == FEED_SAME:
feed_code = 304
elif ret_feed == FEED_ERRHTTP:
feed_code = 400
if ret_feed == FEED_ERREXC:
feed_code = 500
elif ret_feed == FEED_ERRPARSE:
feed_code = 550
if not feed:
continue
feed = self.refresh_feed(feed.pk)
if not feed:
continue
if (
(self.options['force'])
or (random.random() > 0.9)
or (
fetched_feed
and feed.feed_link
and feed.has_page
and (ret_feed == FEED_OK or (ret_feed == FEED_SAME and feed.stories_last_month > 10))
)
):
logging.debug(' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link))
page_importer = PageImporter(feed)
try:
page_data = page_importer.fetch_page()
page_duration = time.time() - start_duration
except SoftTimeLimitExceeded as e:
logging.debug(
" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed
)
page_data = None
feed.save_feed_history(557, 'Timeout', e)
except TimeoutError:
logging.debug(' ---> [%-30s] ~FRPage fetch timed out...' % (feed.log_title[:30]))
page_data = None
feed.save_page_history(555, 'Timeout', '')
except Exception as e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
feed.save_page_history(550, "Page Error", tb)
fetched_feed = None
page_data = None
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if not settings.DEBUG and hasattr(settings, 'SENTRY_DSN') and settings.SENTRY_DSN:
capture_exception(e)
flush()
feed = self.refresh_feed(feed.pk)
logging.debug(' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link))
force = self.options['force']
if random.random() > 0.99:
force = True
icon_importer = IconImporter(feed, page_data=page_data, force=force)
try:
icon_importer.save()
icon_duration = time.time() - start_duration
except SoftTimeLimitExceeded as e:
logging.debug(
" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed
)
feed.save_feed_history(558, 'Timeout', e)
except TimeoutError:
logging.debug(' ---> [%-30s] ~FRIcon fetch timed out...' % (feed.log_title[:30]))
feed.save_page_history(556, 'Timeout', '')
except Exception as e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
# feed.save_feed_history(560, "Icon Error", tb)
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if not settings.DEBUG and hasattr(settings, 'SENTRY_DSN') and settings.SENTRY_DSN:
capture_exception(e)
flush()
else:
logging.debug(
' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s'
% (
feed.log_title[:30],
self.feed_trans[ret_feed],
feed.stories_last_month,
'' if feed.has_page else ' [HAS NO PAGE]',
)
)
feed = self.refresh_feed(feed.pk)
delta = time.time() - start_time
feed.last_load_time = round(delta)
feed.fetched_once = True
try:
feed = feed.save(update_fields=['last_load_time', 'fetched_once'])
except IntegrityError:
logging.debug(
" ***> [%-30s] ~FRIntegrityError on feed: %s"
% (
feed.log_title[:30],
feed.feed_address,
)
)
if ret_entries and ret_entries['new']:
self.publish_to_subscribers(feed, ret_entries['new'])
done_msg = '%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % (
identity,
feed.log_title[:30],
delta,
feed.pk,
self.feed_trans[ret_feed],
)
logging.debug(done_msg)
total_duration = time.time() - start_duration
MAnalyticsFetcher.add(
feed_id=feed.pk,
feed_fetch=feed_fetch_duration,
feed_process=feed_process_duration,
page=page_duration,
icon=icon_duration,
total=total_duration,
feed_code=feed_code,
)
self.feed_stats[ret_feed] += 1
if len(feed_queue) == 1:
return feed
# time_taken = datetime.datetime.utcnow() - self.time_start
def fetch_and_process_archive_pages(self, feed_id):
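        # Backfill the archive by walking pages via ?page=/?paged= query parameters or
        # RFC 5005 prev-archive links, stopping once a page yields no new story hashes.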
feed = Feed.get_by_id(feed_id)
first_seen_feed = None
original_starting_page = self.options['archive_page']
for archive_page_key in ["page", "paged", "rfc5005"]:
seen_story_hashes = set()
failed_pages = 0
self.options['archive_page_key'] = archive_page_key
if archive_page_key == "rfc5005":
self.options['archive_page'] = "rfc5005"
link_prev_archive = None
if first_seen_feed:
for link in getattr(first_seen_feed.feed, 'links', []):
if link['rel'] == 'prev-archive' or link['rel'] == 'next':
link_prev_archive = link['href']
logging.debug(' ---> [%-30s] ~FGFeed has ~SBRFC5005~SN links, filling out archive: %s' % (feed.log_title[:30], link_prev_archive))
break
else:
logging.debug(' ---> [%-30s] ~FBFeed has no RFC5005 links...' % (feed.log_title[:30]))
else:
self.options['archive_page_link'] = link_prev_archive
ffeed = FetchFeed(feed_id, self.options)
try:
ret_feed, fetched_feed = ffeed.fetch()
except TimeoutError:
logging.debug(' ---> [%-30s] ~FRArchive feed fetch timed out...' % (feed.log_title[:30]))
# Timeout means don't bother to keep checking...
continue
raw_feed = ffeed.raw_feed
if fetched_feed and ret_feed == FEED_OK:
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
if not pfeed.fpf or not pfeed.fpf.entries:
continue
for link in getattr(pfeed.fpf.feed, 'links', []):
if link['rel'] == 'prev-archive' or link['rel'] == 'next':
link_prev_archive = link['href']
if not link_prev_archive:
continue
while True:
if not link_prev_archive:
break
if link_prev_archive == self.options.get('archive_page_link', None):
logging.debug(' ---> [%-30s] ~FRNo change in archive page link: %s' % (feed.log_title[:30], link_prev_archive))
break
self.options['archive_page_link'] = link_prev_archive
link_prev_archive = None
ffeed = FetchFeed(feed_id, self.options)
try:
ret_feed, fetched_feed = ffeed.fetch()
except TimeoutError as e:
logging.debug(' ---> [%-30s] ~FRArchive feed fetch timed out...' % (feed.log_title[:30]))
# Timeout means don't bother to keep checking...
break
raw_feed = ffeed.raw_feed
if fetched_feed and ret_feed == FEED_OK:
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
if not pfeed.fpf or not pfeed.fpf.entries:
logging.debug(' ---> [%-30s] ~FRFeed parse failed, no entries' % (feed.log_title[:30]))
continue
for link in getattr(pfeed.fpf.feed, 'links', []):
if link['rel'] == 'prev-archive' or link['rel'] == 'next':
link_prev_archive = link['href']
logging.debug(' ---> [%-30s] ~FGFeed still has ~SBRFC5005~SN links, continuing filling out archive: %s' % (feed.log_title[:30], link_prev_archive))
break
else:
logging.debug(' ---> [%-30s] ~FBFeed has no more RFC5005 links...' % (feed.log_title[:30]))
break
before_story_hashes = len(seen_story_hashes)
pfeed.process()
seen_story_hashes.update(pfeed.archive_seen_story_hashes)
after_story_hashes = len(seen_story_hashes)
if before_story_hashes == after_story_hashes:
logging.debug(' ---> [%-30s] ~FRNo change in story hashes, but has archive link: %s' % (feed.log_title[:30], link_prev_archive))
failed_color = "~FR" if not link_prev_archive else ""
logging.debug(f" ---> [{feed.log_title[:30]:<30}] ~FGStory hashes found, archive RFC5005 ~SB{link_prev_archive}~SN: ~SB~FG{failed_color}{len(seen_story_hashes):,} stories~SN~FB")
else:
for page in range(3 if settings.DEBUG and False else 150):
if page < original_starting_page:
continue
if failed_pages >= 1:
break
self.options['archive_page'] = page+1
ffeed = FetchFeed(feed_id, self.options)
try:
ret_feed, fetched_feed = ffeed.fetch()
except TimeoutError as e:
logging.debug(' ---> [%-30s] ~FRArchive feed fetch timed out...' % (feed.log_title[:30]))
# Timeout means don't bother to keep checking...
break
raw_feed = ffeed.raw_feed
if fetched_feed and ret_feed == FEED_OK:
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
if not pfeed.fpf or not pfeed.fpf.entries:
failed_pages += 1
continue
if not first_seen_feed:
first_seen_feed = pfeed.fpf
before_story_hashes = len(seen_story_hashes)
pfeed.process()
seen_story_hashes.update(pfeed.archive_seen_story_hashes)
after_story_hashes = len(seen_story_hashes)
if before_story_hashes == after_story_hashes:
failed_pages += 1
else:
failed_pages += 1
failed_color = "~FR" if failed_pages > 0 else ""
logging.debug(f" ---> [{feed.log_title[:30]:<30}] ~FGStory hashes found, archive page ~SB{page+1}~SN: ~SB~FG{len(seen_story_hashes):,} stories~SN~FB, {failed_color}{failed_pages} failures")
def publish_to_subscribers(self, feed, new_count):
try:
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
listeners_count = r.publish(str(feed.pk), 'story:new_count:%s' % new_count)
if listeners_count:
logging.debug(
" ---> [%-30s] ~FMPublished to %s subscribers" % (feed.log_title[:30], listeners_count)
)
except redis.ConnectionError:
logging.debug(" ***> [%-30s] ~BMRedis is unavailable for real-time." % (feed.log_title[:30],))
def count_unreads_for_subscribers(self, feed):
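        # Mark each active subscriber for an unread recalculation; when compute_scores is on,
        # preload and cache the feed's recent stories so subscriber scores can be computed immediately.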
subscriber_expire = datetime.datetime.now() - datetime.timedelta(days=settings.SUBSCRIBER_EXPIRE)
user_subs = UserSubscription.objects.filter(
feed=feed, active=True, user__profile__last_seen_on__gte=subscriber_expire
).order_by('-last_read_date')
if not user_subs.count():
return
for sub in user_subs:
if not sub.needs_unread_recalc:
sub.needs_unread_recalc = True
sub.save()
if self.options['compute_scores']:
r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
stories = MStory.objects(story_feed_id=feed.pk, story_date__gte=feed.unread_cutoff)
stories = Feed.format_stories(stories, feed.pk)
story_hashes = r.zrangebyscore(
'zF:%s' % feed.pk,
int(feed.unread_cutoff.strftime('%s')),
int(time.time() + 60 * 60 * 24),
)
missing_story_hashes = set(story_hashes) - set([s['story_hash'] for s in stories])
if missing_story_hashes:
missing_stories = MStory.objects(
story_feed_id=feed.pk, story_hash__in=missing_story_hashes
).read_preference(pymongo.ReadPreference.PRIMARY)
missing_stories = Feed.format_stories(missing_stories, feed.pk)
stories = missing_stories + stories
logging.debug(
' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores'
% (
feed.log_title[:30],
len(missing_stories),
len(missing_story_hashes),
len(stories),
)
)
cache.set("S:v3:%s" % feed.pk, stories, 60)
logging.debug(
' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)'
% (
feed.log_title[:30],
len(stories),
user_subs.count(),
feed.num_subscribers,
feed.active_subscribers,
feed.premium_subscribers,
)
)
self.calculate_feed_scores_with_stories(user_subs, stories)
elif self.options.get('mongodb_replication_lag'):
logging.debug(
' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag'
% (feed.log_title[:30], self.options.get('mongodb_replication_lag'))
)
@timelimit(10)
def calculate_feed_scores_with_stories(self, user_subs, stories):
for sub in user_subs:
            silent = False if self.options.get('verbose', 0) >= 2 else True
sub.calculate_feed_scores(silent=silent, stories=stories)
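# Dispatcher spreads per-process feed queues across worker processes; with a single
# thread it runs the worker inline and returns its result.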
class Dispatcher:
def __init__(self, options, num_threads):
self.options = options
self.num_threads = num_threads
self.workers = []
def add_jobs(self, feeds_queue, feeds_count=1):
"""adds a feed processing job to the pool"""
self.feeds_queue = feeds_queue
self.feeds_count = feeds_count
def run_jobs(self):
if self.options['single_threaded'] or self.num_threads == 1:
return dispatch_workers(self.feeds_queue[0], self.options)
else:
for i in range(self.num_threads):
feed_queue = self.feeds_queue[i]
self.workers.append(
multiprocessing.Process(target=dispatch_workers, args=(feed_queue, self.options))
)
for i in range(self.num_threads):
self.workers[i].start()
def dispatch_workers(feed_queue, options):
worker = FeedFetcherWorker(options)
return worker.process_feed_wrapper(feed_queue)