mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Strip underscores from feed addresses, special case jwz
This commit is contained in:
parent
db30d559a0
commit
546f338b8c
4 changed files with 20 additions and 8 deletions
|
@ -10,6 +10,7 @@ import urllib.request
|
|||
from io import BytesIO
|
||||
from socket import error as SocketError
|
||||
|
||||
import numpy as np
|
||||
import boto3
|
||||
import lxml.html
|
||||
import numpy
|
||||
|
@ -380,16 +381,16 @@ class IconImporter(object):
|
|||
|
||||
# Reshape array of values to merge color bands. [[R], [G], [B], [A]] => [R, G, B, A]
|
||||
if len(shape) > 2:
|
||||
ar = ar.reshape(scipy.product(shape[:2]), shape[2])
|
||||
ar = ar.reshape(np.product(shape[:2]), shape[2])
|
||||
|
||||
# Get NUM_CLUSTERS worth of centroids.
|
||||
ar = ar.astype(numpy.float)
|
||||
ar = ar.astype(float)
|
||||
codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
|
||||
|
||||
# Pare centroids, removing blacks and whites and shades of really dark and really light.
|
||||
original_codes = codes
|
||||
for low, hi in [(60, 200), (35, 230), (10, 250)]:
|
||||
codes = scipy.array(
|
||||
codes = np.array(
|
||||
[
|
||||
code
|
||||
for code in codes
|
||||
|
|
|
@ -46,6 +46,7 @@ from utils.feed_functions import (
|
|||
levenshtein_distance,
|
||||
relative_timesince,
|
||||
seconds_timesince,
|
||||
strip_underscore_from_feed_address,
|
||||
timelimit,
|
||||
)
|
||||
from utils.fields import AutoOneToOneField
|
||||
|
@ -744,7 +745,7 @@ class Feed(models.Model):
|
|||
logging.debug(" ---> Feed points to 'Wierdo' or 'feedsportal', ignoring.")
|
||||
return False, self
|
||||
try:
|
||||
self.feed_address = feed_address
|
||||
self.feed_address = strip_underscore_from_feed_address(feed_address)
|
||||
feed = self.save()
|
||||
feed.count_subscribers()
|
||||
# feed.schedule_feed_fetch_immediately() # Don't fetch as it can get stuck in a loop
|
||||
|
@ -3936,7 +3937,7 @@ def merge_feeds(original_feed_id, duplicate_feed_id, force=False):
|
|||
original_feed, duplicate_feed = duplicate_feed, original_feed
|
||||
original_feed_id, duplicate_feed_id = duplicate_feed_id, original_feed_id
|
||||
if branched_original:
|
||||
original_feed.feed_address = duplicate_feed.feed_address
|
||||
original_feed.feed_address = strip_underscore_from_feed_address(duplicate_feed.feed_address)
|
||||
|
||||
logging.info(
|
||||
" ---> Feed: [%s - %s] %s - %s"
|
||||
|
|
|
@ -52,7 +52,7 @@ from sentry_sdk import capture_exception, flush
|
|||
from utils import json_functions as json
|
||||
from utils import log as logging
|
||||
from utils.facebook_fetcher import FacebookFetcher
|
||||
from utils.feed_functions import TimeoutError, timelimit
|
||||
from utils.feed_functions import TimeoutError, strip_underscore_from_feed_address, timelimit
|
||||
from utils.json_fetcher import JSONFetcher
|
||||
from utils.story_functions import linkify, pre_process_story, strip_tags
|
||||
from utils.twitter_fetcher import TwitterFetcher
|
||||
|
@ -66,6 +66,8 @@ from utils.youtube_fetcher import YoutubeFetcher
|
|||
|
||||
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
|
||||
|
||||
NO_UNDERSCORE_ADDRESSES = ["jwz"]
|
||||
|
||||
|
||||
class FetchFeed:
|
||||
def __init__(self, feed_id, options):
|
||||
|
@ -111,7 +113,7 @@ class FetchFeed:
|
|||
address = self.options["archive_page_link"]
|
||||
elif self.options.get("archive_page", None):
|
||||
address = qurl(address, add={self.options["archive_page_key"]: self.options["archive_page"]})
|
||||
elif address.startswith("http"):
|
||||
elif address.startswith("http") and not any(item in address for item in NO_UNDERSCORE_ADDRESSES):
|
||||
address = qurl(address, add={"_": random.randint(0, 10000)})
|
||||
logging.debug(" ---> [%-30s] ~FBForcing fetch: %s" % (self.feed.log_title[:30], address))
|
||||
elif not self.feed.fetched_once or not self.feed.known_good:
|
||||
|
@ -521,7 +523,7 @@ class ProcessFeed:
|
|||
address = self.fpf.href
|
||||
if self.options["force"] and address:
|
||||
address = qurl(address, remove=["_"])
|
||||
self.feed.feed_address = address
|
||||
self.feed.feed_address = strip_underscore_from_feed_address(address)
|
||||
if not self.feed.known_good:
|
||||
self.feed.fetched_once = True
|
||||
logging.debug(
|
||||
|
|
|
@ -9,6 +9,7 @@ import urllib.parse
|
|||
import urllib.request
|
||||
import warnings
|
||||
|
||||
from qurl import qurl
|
||||
from django.utils.encoding import smart_str
|
||||
from django.utils.translation import ungettext
|
||||
|
||||
|
@ -436,3 +437,10 @@ if __name__ == "__main__":
|
|||
def chunks(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start : start + n] for start in range(0, len(l), n))
|
||||
|
||||
|
||||
def strip_underscore_from_feed_address(feed_address):
    """Return ``feed_address`` with the ``_`` query parameter removed.

    A random ``_=####`` parameter is appended to feed URLs when a fetch is
    forced; this drops it so the parameter is never persisted as part of the
    feed's address.

    Args:
        feed_address: The feed URL as a string. Empty or ``None`` values are
            returned unchanged.

    Returns:
        The URL string without any ``_`` query parameter.
    """
    if not feed_address:
        # Nothing to parse; hand back the falsy value as-is.
        return feed_address

    # Strip _=#### from feed_address. ``qurl`` returns the rewritten URL as a
    # string (it is used that way by the feed fetcher), so the result has no
    # ``remove_query_param`` method or ``.url`` attribute to call.
    return qurl(feed_address, remove=["_"])
|
||||
|
|
Loading…
Add table
Reference in a new issue