Strip underscores from feed addresses, special case jwz

Samuel Clay 2024-06-30 12:13:29 -04:00
parent db30d559a0
commit 546f338b8c
4 changed files with 20 additions and 8 deletions

View file

@@ -10,6 +10,7 @@ import urllib.request
 from io import BytesIO
 from socket import error as SocketError
 
+import numpy as np
 import boto3
 import lxml.html
 import numpy
@@ -380,16 +381,16 @@ class IconImporter(object):
         # Reshape array of values to merge color bands. [[R], [G], [B], [A]] => [R, G, B, A]
         if len(shape) > 2:
-            ar = ar.reshape(scipy.product(shape[:2]), shape[2])
+            ar = ar.reshape(np.product(shape[:2]), shape[2])
         # Get NUM_CLUSTERS worth of centroids.
-        ar = ar.astype(numpy.float)
+        ar = ar.astype(float)
         codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
         # Pare centroids, removing blacks and whites and shades of really dark and really light.
         original_codes = codes
         for low, hi in [(60, 200), (35, 230), (10, 250)]:
-            codes = scipy.array(
+            codes = np.array(
                 [
                     code
                     for code in codes
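
The hunk above is a SciPy-to-NumPy migration: numpy.float was removed in NumPy 1.24, and SciPy has deprecated and removed its top-level NumPy re-exports such as scipy.product and scipy.array. A minimal sketch of the updated clustering step, with NUM_CLUSTERS and the pixel array assumed for illustration (np.prod is the current spelling; the commit's np.product is an older alias for the same function):

    import numpy as np
    import scipy.cluster.vq

    NUM_CLUSTERS = 5  # assumed value for illustration

    def dominant_colors(ar):
        # Merge color bands: (H, W, bands) -> (H*W, bands), as in the diff
        shape = ar.shape
        if len(shape) > 2:
            ar = ar.reshape(np.prod(shape[:2]), shape[2])
        # numpy.float is gone; the builtin float gives the same float64 dtype
        ar = ar.astype(float)
        # k-means returns one centroid per dominant color in the icon
        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        return codes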

View file

@@ -46,6 +46,7 @@ from utils.feed_functions import (
     levenshtein_distance,
     relative_timesince,
     seconds_timesince,
+    strip_underscore_from_feed_address,
     timelimit,
 )
 from utils.fields import AutoOneToOneField
@@ -744,7 +745,7 @@ class Feed(models.Model):
             logging.debug(" ---> Feed points to 'Wierdo' or 'feedsportal', ignoring.")
             return False, self
         try:
-            self.feed_address = feed_address
+            self.feed_address = strip_underscore_from_feed_address(feed_address)
             feed = self.save()
             feed.count_subscribers()
             # feed.schedule_feed_fetch_immediately() # Don't fetch as it can get stuck in a loop
@@ -3936,7 +3937,7 @@ def merge_feeds(original_feed_id, duplicate_feed_id, force=False):
         original_feed, duplicate_feed = duplicate_feed, original_feed
         original_feed_id, duplicate_feed_id = duplicate_feed_id, original_feed_id
     if branched_original:
-        original_feed.feed_address = duplicate_feed.feed_address
+        original_feed.feed_address = strip_underscore_from_feed_address(duplicate_feed.feed_address)
     logging.info(
         " ---> Feed: [%s - %s] %s - %s"

View file

@@ -52,7 +52,7 @@ from sentry_sdk import capture_exception, flush
 from utils import json_functions as json
 from utils import log as logging
 from utils.facebook_fetcher import FacebookFetcher
-from utils.feed_functions import TimeoutError, timelimit
+from utils.feed_functions import TimeoutError, strip_underscore_from_feed_address, timelimit
 from utils.json_fetcher import JSONFetcher
 from utils.story_functions import linkify, pre_process_story, strip_tags
 from utils.twitter_fetcher import TwitterFetcher
@@ -66,6 +66,8 @@ from utils.youtube_fetcher import YoutubeFetcher
 FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
 
+NO_UNDERSCORE_ADDRESSES = ["jwz"]
+
 class FetchFeed:
     def __init__(self, feed_id, options):
@@ -111,7 +113,7 @@ class FetchFeed:
             address = self.options["archive_page_link"]
         elif self.options.get("archive_page", None):
             address = qurl(address, add={self.options["archive_page_key"]: self.options["archive_page"]})
-        elif address.startswith("http"):
+        elif address.startswith("http") and not any(item in address for item in NO_UNDERSCORE_ADDRESSES):
             address = qurl(address, add={"_": random.randint(0, 10000)})
             logging.debug(" ---> [%-30s] ~FBForcing fetch: %s" % (self.feed.log_title[:30], address))
         elif not self.feed.fetched_once or not self.feed.known_good:
@@ -521,7 +523,7 @@ class ProcessFeed:
             address = self.fpf.href
             if self.options["force"] and address:
                 address = qurl(address, remove=["_"])
-            self.feed.feed_address = address
+            self.feed.feed_address = strip_underscore_from_feed_address(address)
             if not self.feed.known_good:
                 self.feed.fetched_once = True
                 logging.debug(
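
Together the two hunks above handle the cache-buster in both directions: FetchFeed skips adding the random _ parameter for any address containing a NO_UNDERSCORE_ADDRESSES entry (the substring "jwz" matches, e.g., https://www.jwz.org/blog/feed/), and ProcessFeed strips the parameter before persisting whatever address the parser reports. A condensed sketch of the guard, pulled out of FetchFeed with a hypothetical function name:

    import random

    from qurl import qurl

    NO_UNDERSCORE_ADDRESSES = ["jwz"]

    def force_fetch_address(address):
        # Add _=<random> to bust caches, unless the address is special-cased
        if address.startswith("http") and not any(item in address for item in NO_UNDERSCORE_ADDRESSES):
            address = qurl(address, add={"_": random.randint(0, 10000)})
        return address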

View file

@@ -9,6 +9,7 @@ import urllib.parse
 import urllib.request
 import warnings
 
+from qurl import qurl
 from django.utils.encoding import smart_str
 from django.utils.translation import ungettext
@@ -436,3 +437,10 @@ if __name__ == "__main__":
 def chunks(l, n):
     for i in range(0, len(l), n):
         yield l[i : i + n]
+
+
+def strip_underscore_from_feed_address(feed_address):
+    # Strip the _=#### cache-buster from feed_address. qurl returns the
+    # rewritten URL as a string, as in qurl(address, remove=["_"]) above.
+    stripped_address = qurl(feed_address, remove=["_"])
+    return stripped_address
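
A usage sketch, assuming qurl's remove keyword drops the parameter and re-encodes the remaining query:

    >>> strip_underscore_from_feed_address("http://example.com/rss?_=8172&page=2")
    'http://example.com/rss?page=2'
    >>> strip_underscore_from_feed_address("http://example.com/rss")
    'http://example.com/rss'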