Fixing broken 403 check

This commit is contained in:
Samuel Clay 2025-04-07 09:50:53 -07:00
parent a91fd46abc
commit 97d1e12ad3

View file

@ -1,8 +1,8 @@
import datetime import datetime
import html
import multiprocessing import multiprocessing
import time import time
import traceback import traceback
import html
import django import django
@ -103,7 +103,6 @@ def preprocess_feed_encoding(raw_xml):
"‰", # Bullet point (•) "‰", # Bullet point (•)
"‮", # Registered trademark (®) "‮", # Registered trademark (®)
"
", # Copyright (©) "
", # Copyright (©)
# Additional patterns that indicate encoding issues # Additional patterns that indicate encoding issues
"é", # é misencoded "é", # é misencoded
"î", # î misencoded "î", # î misencoded
@ -125,7 +124,7 @@ def preprocess_feed_encoding(raw_xml):
# Step 2: Encoding Reinterpretation # Step 2: Encoding Reinterpretation
# Re-encode as Latin-1/Windows-1252 and decode as UTF-8 # Re-encode as Latin-1/Windows-1252 and decode as UTF-8
# This "encoding shuffle" restores the original characters # This "encoding shuffle" restores the original characters
corrected = unescaped.encode('latin1').decode('utf-8', errors='replace') corrected = unescaped.encode("latin1").decode("utf-8", errors="replace")
return corrected return corrected
except (UnicodeError, AttributeError) as e: except (UnicodeError, AttributeError) as e:
@ -136,6 +135,7 @@ def preprocess_feed_encoding(raw_xml):
# If no indicators are found, return the original XML # If no indicators are found, return the original XML
return raw_xml return raw_xml
# from utils.feed_functions import mail_feed_error_to_admin # from utils.feed_functions import mail_feed_error_to_admin
@ -224,7 +224,10 @@ class FetchFeed:
# Apply encoding preprocessing to special feed content # Apply encoding preprocessing to special feed content
processed_youtube_feed = preprocess_feed_encoding(youtube_feed) processed_youtube_feed = preprocess_feed_encoding(youtube_feed)
if processed_youtube_feed != youtube_feed: if processed_youtube_feed != youtube_feed:
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to YouTube feed" % (self.feed.log_title[:30])) logging.debug(
" ---> [%-30s] ~FGApplied encoding correction to YouTube feed"
% (self.feed.log_title[:30])
)
self.fpf = feedparser.parse(processed_youtube_feed, sanitize_html=False) self.fpf = feedparser.parse(processed_youtube_feed, sanitize_html=False)
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])): elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
twitter_feed = self.fetch_twitter(address) twitter_feed = self.fetch_twitter(address)
@ -236,7 +239,10 @@ class FetchFeed:
# Apply encoding preprocessing to special feed content # Apply encoding preprocessing to special feed content
processed_twitter_feed = preprocess_feed_encoding(twitter_feed) processed_twitter_feed = preprocess_feed_encoding(twitter_feed)
if processed_twitter_feed != twitter_feed: if processed_twitter_feed != twitter_feed:
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Twitter feed" % (self.feed.log_title[:30])) logging.debug(
" ---> [%-30s] ~FGApplied encoding correction to Twitter feed"
% (self.feed.log_title[:30])
)
self.fpf = feedparser.parse(processed_twitter_feed) self.fpf = feedparser.parse(processed_twitter_feed)
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])): elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
facebook_feed = self.fetch_facebook() facebook_feed = self.fetch_facebook()
@ -248,7 +254,10 @@ class FetchFeed:
# Apply encoding preprocessing to special feed content # Apply encoding preprocessing to special feed content
processed_facebook_feed = preprocess_feed_encoding(facebook_feed) processed_facebook_feed = preprocess_feed_encoding(facebook_feed)
if processed_facebook_feed != facebook_feed: if processed_facebook_feed != facebook_feed:
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Facebook feed" % (self.feed.log_title[:30])) logging.debug(
" ---> [%-30s] ~FGApplied encoding correction to Facebook feed"
% (self.feed.log_title[:30])
)
self.fpf = feedparser.parse(processed_facebook_feed) self.fpf = feedparser.parse(processed_facebook_feed)
elif self.feed.is_forbidden: elif self.feed.is_forbidden:
forbidden_feed = self.fetch_forbidden() forbidden_feed = self.fetch_forbidden()
@ -260,7 +269,10 @@ class FetchFeed:
# Apply encoding preprocessing to special feed content # Apply encoding preprocessing to special feed content
processed_forbidden_feed = preprocess_feed_encoding(forbidden_feed) processed_forbidden_feed = preprocess_feed_encoding(forbidden_feed)
if processed_forbidden_feed != forbidden_feed: if processed_forbidden_feed != forbidden_feed:
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to forbidden feed" % (self.feed.log_title[:30])) logging.debug(
" ---> [%-30s] ~FGApplied encoding correction to forbidden feed"
% (self.feed.log_title[:30])
)
self.fpf = feedparser.parse(processed_forbidden_feed) self.fpf = feedparser.parse(processed_forbidden_feed)
if not self.fpf and "json" in address: if not self.fpf and "json" in address:
@ -336,7 +348,10 @@ class FetchFeed:
# Apply encoding preprocessing to JSON feed content # Apply encoding preprocessing to JSON feed content
processed_json_feed = preprocess_feed_encoding(json_feed) processed_json_feed = preprocess_feed_encoding(json_feed)
if processed_json_feed != json_feed: if processed_json_feed != json_feed:
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to JSON feed" % (self.feed.log_title[:30])) logging.debug(
" ---> [%-30s] ~FGApplied encoding correction to JSON feed"
% (self.feed.log_title[:30])
)
self.fpf = feedparser.parse(processed_json_feed) self.fpf = feedparser.parse(processed_json_feed)
elif raw_feed.content and raw_feed.status_code < 400: elif raw_feed.content and raw_feed.status_code < 400:
response_headers = raw_feed.headers response_headers = raw_feed.headers
@ -345,7 +360,10 @@ class FetchFeed:
# Preprocess feed to fix encoding issues before parsing with feedparser # Preprocess feed to fix encoding issues before parsing with feedparser
processed_feed = preprocess_feed_encoding(self.raw_feed) processed_feed = preprocess_feed_encoding(self.raw_feed)
if processed_feed != self.raw_feed: if processed_feed != self.raw_feed:
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to feed with misencoded HTML entities" % (self.feed.log_title[:30])) logging.debug(
" ---> [%-30s] ~FGApplied encoding correction to feed with misencoded HTML entities"
% (self.feed.log_title[:30])
)
self.fpf = feedparser.parse(processed_feed, response_headers=response_headers) self.fpf = feedparser.parse(processed_feed, response_headers=response_headers)
if self.options["verbose"]: if self.options["verbose"]:
logging.debug( logging.debug(
@ -701,7 +719,7 @@ class ProcessFeed:
" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." " ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
% (self.feed.log_title[:30], self.fpf.status) % (self.feed.log_title[:30], self.fpf.status)
) )
if self.fpf.status in 403 and not self.feed.is_forbidden: if self.fpf.status in [403] and not self.feed.is_forbidden:
self.feed = self.feed.set_is_forbidden() self.feed = self.feed.set_is_forbidden()
fixed_feed = None fixed_feed = None
if not self.feed.known_good: if not self.feed.known_good: