Strip underscores from feed addresses, special case jwz

Samuel Clay 2024-06-30 12:13:29 -04:00
parent db30d559a0
commit 546f338b8c
4 changed files with 20 additions and 8 deletions

View file

@@ -10,6 +10,7 @@ import urllib.request
 from io import BytesIO
 from socket import error as SocketError
 
+import numpy as np
 import boto3
 import lxml.html
 import numpy
@@ -380,16 +381,16 @@ class IconImporter(object):
         # Reshape array of values to merge color bands. [[R], [G], [B], [A]] => [R, G, B, A]
         if len(shape) > 2:
-            ar = ar.reshape(scipy.product(shape[:2]), shape[2])
+            ar = ar.reshape(np.product(shape[:2]), shape[2])
         # Get NUM_CLUSTERS worth of centroids.
-        ar = ar.astype(numpy.float)
+        ar = ar.astype(float)
         codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
         # Pare centroids, removing blacks and whites and shades of really dark and really light.
         original_codes = codes
         for low, hi in [(60, 200), (35, 230), (10, 250)]:
-            codes = scipy.array(
+            codes = np.array(
                 [
                     code
                     for code in codes
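
The hunk above is a SciPy-to-NumPy migration: numpy.float was removed in NumPy 1.24, and SciPy has deprecated and removed its top-level NumPy re-exports such as scipy.product and scipy.array. A minimal sketch of the updated clustering step, with NUM_CLUSTERS and the pixel array assumed for illustration (np.prod is the current spelling; the commit's np.product is an older alias for the same function):

    import numpy as np
    import scipy.cluster.vq

    NUM_CLUSTERS = 5  # assumed value for illustration

    def dominant_colors(ar):
        # Merge color bands: (H, W, bands) -> (H*W, bands), as in the diff
        shape = ar.shape
        if len(shape) > 2:
            ar = ar.reshape(np.prod(shape[:2]), shape[2])
        # numpy.float is gone; the builtin float gives the same float64 dtype
        ar = ar.astype(float)
        # k-means returns one centroid per dominant color in the icon
        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
        return codes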

View file

@@ -46,6 +46,7 @@ from utils.feed_functions import (
     levenshtein_distance,
     relative_timesince,
     seconds_timesince,
+    strip_underscore_from_feed_address,
     timelimit,
 )
 from utils.fields import AutoOneToOneField
@@ -744,7 +745,7 @@ class Feed(models.Model):
             logging.debug(" ---> Feed points to 'Wierdo' or 'feedsportal', ignoring.")
             return False, self
         try:
-            self.feed_address = feed_address
+            self.feed_address = strip_underscore_from_feed_address(feed_address)
             feed = self.save()
             feed.count_subscribers()
             # feed.schedule_feed_fetch_immediately() # Don't fetch as it can get stuck in a loop
@@ -3936,7 +3937,7 @@ def merge_feeds(original_feed_id, duplicate_feed_id, force=False):
         original_feed, duplicate_feed = duplicate_feed, original_feed
         original_feed_id, duplicate_feed_id = duplicate_feed_id, original_feed_id
     if branched_original:
-        original_feed.feed_address = duplicate_feed.feed_address
+        original_feed.feed_address = strip_underscore_from_feed_address(duplicate_feed.feed_address)
     logging.info(
         " ---> Feed: [%s - %s] %s - %s"

View file

@@ -52,7 +52,7 @@ from sentry_sdk import capture_exception, flush
 from utils import json_functions as json
 from utils import log as logging
 from utils.facebook_fetcher import FacebookFetcher
-from utils.feed_functions import TimeoutError, timelimit
+from utils.feed_functions import TimeoutError, strip_underscore_from_feed_address, timelimit
 from utils.json_fetcher import JSONFetcher
 from utils.story_functions import linkify, pre_process_story, strip_tags
 from utils.twitter_fetcher import TwitterFetcher
@@ -66,6 +66,8 @@ from utils.youtube_fetcher import YoutubeFetcher
 FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
 
+NO_UNDERSCORE_ADDRESSES = ["jwz"]
+
 class FetchFeed:
     def __init__(self, feed_id, options):
@@ -111,7 +113,7 @@ class FetchFeed:
             address = self.options["archive_page_link"]
         elif self.options.get("archive_page", None):
             address = qurl(address, add={self.options["archive_page_key"]: self.options["archive_page"]})
-        elif address.startswith("http"):
+        elif address.startswith("http") and not any(item in address for item in NO_UNDERSCORE_ADDRESSES):
             address = qurl(address, add={"_": random.randint(0, 10000)})
             logging.debug(" ---> [%-30s] ~FBForcing fetch: %s" % (self.feed.log_title[:30], address))
         elif not self.feed.fetched_once or not self.feed.known_good:
@@ -521,7 +523,7 @@ class ProcessFeed:
             address = self.fpf.href
             if self.options["force"] and address:
                 address = qurl(address, remove=["_"])
-            self.feed.feed_address = address
+            self.feed.feed_address = strip_underscore_from_feed_address(address)
             if not self.feed.known_good:
                 self.feed.fetched_once = True
                 logging.debug(
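
Together the two hunks above handle the cache-buster in both directions: FetchFeed skips adding the random _ parameter for any address containing a NO_UNDERSCORE_ADDRESSES entry (the substring "jwz" matches, e.g., https://www.jwz.org/blog/feed/), and ProcessFeed strips the parameter before persisting whatever address the parser reports. A condensed sketch of the guard, pulled out of FetchFeed with a hypothetical function name:

    import random

    from qurl import qurl

    NO_UNDERSCORE_ADDRESSES = ["jwz"]

    def force_fetch_address(address):
        # Add _=<random> to bust caches, unless the address is special-cased
        if address.startswith("http") and not any(item in address for item in NO_UNDERSCORE_ADDRESSES):
            address = qurl(address, add={"_": random.randint(0, 10000)})
        return address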

View file

@@ -9,6 +9,7 @@ import urllib.parse
 import urllib.request
 import warnings
 
+from qurl import qurl
 from django.utils.encoding import smart_str
 from django.utils.translation import ungettext
@@ -436,3 +437,10 @@ if __name__ == "__main__":
 def chunks(l, n):
     for i in range(0, len(l), n):
         yield l[i : i + n]
+
+
+def strip_underscore_from_feed_address(feed_address):
+    # Strip the _=#### cache-buster from feed_address. qurl returns the
+    # rewritten URL as a string, as in qurl(address, remove=["_"]) above.
+    stripped_address = qurl(feed_address, remove=["_"])
+    return stripped_address
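
A usage sketch, assuming qurl's remove keyword drops the parameter and re-encodes the remaining query:

    >>> strip_underscore_from_feed_address("http://example.com/rss?_=8172&page=2")
    'http://example.com/rss?page=2'
    >>> strip_underscore_from_feed_address("http://example.com/rss")
    'http://example.com/rss'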