NewsBlur-viq/apps/rss_feeds/tasks.py
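
# Celery tasks for scheduling, queueing, and fetching RSS feeds. Scheduling
# state lives in Redis (REDIS_FEED_UPDATE_POOL): "scheduled_updates" is a zset
# of feed ids scored by next-update time, "queued_feeds" is a set of feeds
# awaiting tasking, and "tasked_feeds" is a zset of feeds handed to workers.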

import datetime
import os
import shutil
import time

import redis
from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings

from apps.profile.middleware import DBProfilerMiddleware
from newsblur_web.celeryapp import app
from utils import log as logging
from utils.redis_raw_log_middleware import RedisDumpMiddleware

FEED_TASKING_MAX = 10000
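

# Periodic dispatcher: moves feeds whose scheduled update time has arrived from
# the "scheduled_updates" zset into the "queued_feeds" set, then hands up to
# FEED_TASKING_MAX randomly sampled members to Feed.task_feeds(), unless too
# many feeds are already tasked out.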
@app.task(name="task-feeds")
2020-11-13 12:14:37 -05:00
def TaskFeeds():
2024-04-24 09:43:56 -04:00
from apps.rss_feeds.models import Feed
2020-11-13 12:14:37 -05:00
settings.LOG_TO_STREAM = True
now = datetime.datetime.utcnow()
start = time.time()
r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)
2024-04-24 09:43:56 -04:00
tasked_feeds_size = r.zcard("tasked_feeds")
2020-11-13 12:14:37 -05:00
hour_ago = now - datetime.timedelta(hours=1)
2024-04-24 09:43:56 -04:00
r.zremrangebyscore("fetched_feeds_last_hour", 0, int(hour_ago.strftime("%s")))
2020-11-13 12:14:37 -05:00
now_timestamp = int(now.strftime("%s"))
2024-04-24 09:43:56 -04:00
queued_feeds = r.zrangebyscore("scheduled_updates", 0, now_timestamp)
r.zremrangebyscore("scheduled_updates", 0, now_timestamp)
2020-11-13 12:14:37 -05:00
if not queued_feeds:
logging.debug(" ---> ~SN~FB~BMNo feeds to queue! Exiting...")
return
2024-04-24 09:43:56 -04:00
r.sadd("queued_feeds", *queued_feeds)
logging.debug(
" ---> ~SN~FBQueuing ~SB%s~SN stale feeds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)"
% (len(queued_feeds), r.zcard("tasked_feeds"), r.scard("queued_feeds"), r.zcard("scheduled_updates"))
)
2020-11-13 12:14:37 -05:00
# Regular feeds
if tasked_feeds_size < FEED_TASKING_MAX:
2024-04-24 09:43:56 -04:00
feeds = r.srandmember("queued_feeds", FEED_TASKING_MAX)
2020-11-13 12:14:37 -05:00
Feed.task_feeds(feeds, verbose=True)
active_count = len(feeds)
else:
logging.debug(" ---> ~SN~FBToo many tasked feeds. ~SB%s~SN tasked." % tasked_feeds_size)
active_count = 0
2024-02-28 09:33:10 -05:00
feeds = []
2024-04-24 09:43:56 -04:00
logging.debug(
" ---> ~SN~FBTasking %s feeds took ~SB%s~SN seconds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)"
% (
active_count,
int((time.time() - start)),
r.zcard("tasked_feeds"),
r.scard("queued_feeds"),
r.zcard("scheduled_updates"),
)
)
logging.debug(" ---> ~FBFeeds being tasked: ~SB%s" % feeds)
2020-11-13 12:14:37 -05:00
2024-04-24 09:43:56 -04:00
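

# Sweeper for feeds the scheduler lost track of: never-fetched active feeds,
# feeds stuck in "tasked_feeds" for over ten minutes, and feeds whose next
# scheduled update is more than a day overdue.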
@app.task(name="task-broken-feeds")
2020-11-13 12:14:37 -05:00
def TaskBrokenFeeds():
2024-04-24 09:43:56 -04:00
from apps.rss_feeds.models import Feed
2020-11-13 12:14:37 -05:00
settings.LOG_TO_STREAM = True
now = datetime.datetime.utcnow()
start = time.time()
r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
logging.debug(" ---> ~SN~FBQueuing broken feeds...")
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
# Force refresh feeds
2024-04-24 09:43:56 -04:00
refresh_feeds = Feed.objects.filter(active=True, fetched_once=False, active_subscribers__gte=1).order_by(
"?"
)[:100]
2020-11-13 12:14:37 -05:00
refresh_count = refresh_feeds.count()
cp1 = time.time()
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
logging.debug(" ---> ~SN~FBFound %s active, unfetched broken feeds" % refresh_count)
# Mistakenly inactive feeds
2024-04-24 09:43:56 -04:00
hours_ago = (now - datetime.timedelta(minutes=10)).strftime("%s")
old_tasked_feeds = r.zrangebyscore("tasked_feeds", 0, hours_ago)
2020-11-13 12:14:37 -05:00
inactive_count = len(old_tasked_feeds)
if inactive_count:
2024-04-24 09:43:56 -04:00
r.zremrangebyscore("tasked_feeds", 0, hours_ago)
2020-11-13 12:14:37 -05:00
# r.sadd('queued_feeds', *old_tasked_feeds)
for feed_id in old_tasked_feeds:
2024-04-24 09:43:56 -04:00
r.zincrby("error_feeds", 1, feed_id)
2020-11-13 12:14:37 -05:00
feed = Feed.get_by_id(feed_id)
feed.set_next_scheduled_update()
2024-04-24 09:43:56 -04:00
logging.debug(
" ---> ~SN~FBRe-queuing ~SB%s~SN dropped/broken feeds (~SB%s/%s~SN queued/tasked)"
% (inactive_count, r.scard("queued_feeds"), r.zcard("tasked_feeds"))
)
2020-11-13 12:14:37 -05:00
cp2 = time.time()
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
old = now - datetime.timedelta(days=1)
2024-04-24 09:43:56 -04:00
old_feeds = Feed.objects.filter(next_scheduled_update__lte=old, active_subscribers__gte=1).order_by("?")[
:500
]
2020-11-13 12:14:37 -05:00
old_count = old_feeds.count()
cp3 = time.time()
2024-04-24 09:43:56 -04:00
logging.debug(
" ---> ~SN~FBTasking ~SBrefresh:~FC%s~FB inactive:~FC%s~FB old:~FC%s~SN~FB broken feeds... (%.4s/%.4s/%.4s)"
% (
refresh_count,
inactive_count,
old_count,
cp1 - start,
cp2 - cp1,
cp3 - cp2,
)
)
2020-11-13 12:14:37 -05:00
Feed.task_feeds(refresh_feeds, verbose=False)
Feed.task_feeds(old_feeds, verbose=False)
2024-04-24 09:43:56 -04:00
logging.debug(
" ---> ~SN~FBTasking broken feeds took ~SB%s~SN seconds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)"
% (
int((time.time() - start)),
r.zcard("tasked_feeds"),
r.scard("queued_feeds"),
r.zcard("scheduled_updates"),
)
)
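

# The actual fetch task. Drops stale "tasked_feeds" entries whose id no longer
# matches a real feed, optionally attaches the DB/Redis profiling middleware,
# and records a 505 "Timeout" in the feed's history when Celery's soft time
# limit (9 minutes) fires mid-fetch.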
@app.task(name="update-feeds", time_limit=10 * 60, soft_time_limit=9 * 60, ignore_result=True)
2020-11-13 12:14:37 -05:00
def UpdateFeeds(feed_pks):
from apps.rss_feeds.models import Feed
from apps.statistics.models import MStatistics
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
r = redis.Redis(connection_pool=settings.REDIS_FEED_UPDATE_POOL)
2024-04-24 09:43:56 -04:00
mongodb_replication_lag = int(MStatistics.get("mongodb_replication_lag", 0))
2020-11-13 12:14:37 -05:00
compute_scores = bool(mongodb_replication_lag < 10)
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
profiler = DBProfilerMiddleware()
profiler_activated = profiler.process_celery()
if profiler_activated:
settings.MONGO_COMMAND_LOGGER.process_celery(profiler)
2020-11-13 12:14:37 -05:00
redis_middleware = RedisDumpMiddleware()
redis_middleware.process_celery(profiler)
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
options = {
2024-04-24 09:43:56 -04:00
"quick": float(MStatistics.get("quick_fetch", 0)),
"updates_off": MStatistics.get("updates_off", False),
"compute_scores": compute_scores,
"mongodb_replication_lag": mongodb_replication_lag,
2020-11-13 12:14:37 -05:00
}
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
if not isinstance(feed_pks, list):
feed_pks = [feed_pks]
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
for feed_pk in feed_pks:
feed = Feed.get_by_id(feed_pk)
if not feed or feed.pk != int(feed_pk):
2024-04-24 09:43:56 -04:00
logging.info(
" ---> ~FRRemoving feed_id %s from tasked_feeds queue, points to %s..."
% (feed_pk, feed and feed.pk)
)
r.zrem("tasked_feeds", feed_pk)
2020-11-13 12:14:37 -05:00
if not feed:
continue
try:
feed.update(**options)
2020-12-03 14:16:47 -05:00
except SoftTimeLimitExceeded as e:
2024-04-24 09:43:56 -04:00
feed.save_feed_history(505, "Timeout", e)
2020-11-13 12:14:37 -05:00
logging.info(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
2024-04-24 09:43:56 -04:00
if profiler_activated:
profiler.process_celery_finished()
2020-11-13 12:14:37 -05:00
2024-04-24 09:43:56 -04:00
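

# First fetch for newly added feeds; unlike UpdateFeeds, no quick-fetch,
# updates-off, or replication-lag options are applied.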
@app.task(name="new-feeds", time_limit=10 * 60, soft_time_limit=9 * 60, ignore_result=True)
2020-11-13 12:14:37 -05:00
def NewFeeds(feed_pks):
from apps.rss_feeds.models import Feed
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
if not isinstance(feed_pks, list):
feed_pks = [feed_pks]
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
options = {}
for feed_pk in feed_pks:
feed = Feed.get_by_id(feed_pk)
2024-04-24 09:43:56 -04:00
if not feed:
continue
2020-11-13 12:14:37 -05:00
feed.update(options=options)
2024-04-24 09:43:56 -04:00
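

# Real-time (PubSubHubbub) updates: the pushed XML payload is handed straight
# to Feed.update() via the "feed_xml" option instead of being re-fetched.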
@app.task(name="push-feeds", ignore_result=True)
2020-11-13 12:14:37 -05:00
def PushFeeds(feed_id, xml):
from apps.rss_feeds.models import Feed
from apps.statistics.models import MStatistics
2024-04-24 09:43:56 -04:00
mongodb_replication_lag = int(MStatistics.get("mongodb_replication_lag", 0))
2020-11-13 12:14:37 -05:00
compute_scores = bool(mongodb_replication_lag < 60)
2024-04-24 09:43:56 -04:00
2020-11-13 12:14:37 -05:00
options = {
2024-04-24 09:43:56 -04:00
"feed_xml": xml,
"compute_scores": compute_scores,
"mongodb_replication_lag": mongodb_replication_lag,
2020-11-13 12:14:37 -05:00
}
feed = Feed.get_by_id(feed_id)
if feed:
feed.update(options=options)
2024-04-24 09:43:56 -04:00
2020-11-13 13:26:25 -05:00
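

# Defers Feed.schedule_feed_fetches_immediately() to a worker so user-triggered
# refreshes don't block the request cycle.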
@app.task()
def ScheduleImmediateFetches(feed_ids, user_id=None):
    from apps.rss_feeds.models import Feed

    if not isinstance(feed_ids, list):
        feed_ids = [feed_ids]

    Feed.schedule_feed_fetches_immediately(feed_ids, user_id=user_id)
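

# Defers Feed.setup_feeds_for_premium_subscribers() to a worker.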
@app.task()
def SchedulePremiumSetup(feed_ids):
    from apps.rss_feeds.models import Feed

    if not isinstance(feed_ids, list):
        feed_ids = [feed_ids]

    Feed.setup_feeds_for_premium_subscribers(feed_ids)
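

# Defers MStarredStoryCounts.count_for_user() to a worker to recount a user's
# saved-story tags.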
@app.task()
def ScheduleCountTagsForUser(user_id):
    from apps.rss_feeds.models import MStarredStoryCounts

    MStarredStoryCounts.count_for_user(user_id)
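

# A minimal dispatch sketch, for illustration only. The real call sites live in
# Feed.task_feeds() and the celerybeat schedule; the queue name below is an
# assumption, not something defined in this file:
#
#     from apps.rss_feeds.tasks import UpdateFeeds
#     UpdateFeeds.apply_async(args=([42],), queue="update_feeds")  # hypothetical feed id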