mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-04-13 09:42:01 +00:00
Fixing broken 403 check
This commit is contained in:
parent
a91fd46abc
commit
97d1e12ad3
1 changed file with 44 additions and 26 deletions
|
@ -1,8 +1,8 @@
|
||||||
import datetime
|
import datetime
|
||||||
|
import html
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import html
|
|
||||||
|
|
||||||
import django
|
import django
|
||||||
|
|
||||||
|
@ -103,7 +103,6 @@ def preprocess_feed_encoding(raw_xml):
|
||||||
"‰", # Bullet point (•)
|
"‰", # Bullet point (•)
|
||||||
"‮", # Registered trademark (®)
|
"‮", # Registered trademark (®)
|
||||||
"
", # Copyright (©)
|
"
", # Copyright (©)
|
||||||
|
|
||||||
# Additional patterns that indicate encoding issues
|
# Additional patterns that indicate encoding issues
|
||||||
"é", # é misencoded
|
"é", # é misencoded
|
||||||
"î", # ® misencoded
|
"î", # ® misencoded
|
||||||
|
@ -125,7 +124,7 @@ def preprocess_feed_encoding(raw_xml):
|
||||||
# Step 2: Encoding Reinterpretation
|
# Step 2: Encoding Reinterpretation
|
||||||
# Re-encode as Latin-1/Windows-1252 and decode as UTF-8
|
# Re-encode as Latin-1/Windows-1252 and decode as UTF-8
|
||||||
# This "encoding shuffle" restores the original characters
|
# This "encoding shuffle" restores the original characters
|
||||||
corrected = unescaped.encode('latin1').decode('utf-8', errors='replace')
|
corrected = unescaped.encode("latin1").decode("utf-8", errors="replace")
|
||||||
|
|
||||||
return corrected
|
return corrected
|
||||||
except (UnicodeError, AttributeError) as e:
|
except (UnicodeError, AttributeError) as e:
|
||||||
|
@ -136,6 +135,7 @@ def preprocess_feed_encoding(raw_xml):
|
||||||
# If no indicators are found, return the original XML
|
# If no indicators are found, return the original XML
|
||||||
return raw_xml
|
return raw_xml
|
||||||
|
|
||||||
|
|
||||||
# from utils.feed_functions import mail_feed_error_to_admin
|
# from utils.feed_functions import mail_feed_error_to_admin
|
||||||
|
|
||||||
|
|
||||||
|
@ -224,7 +224,10 @@ class FetchFeed:
|
||||||
# Apply encoding preprocessing to special feed content
|
# Apply encoding preprocessing to special feed content
|
||||||
processed_youtube_feed = preprocess_feed_encoding(youtube_feed)
|
processed_youtube_feed = preprocess_feed_encoding(youtube_feed)
|
||||||
if processed_youtube_feed != youtube_feed:
|
if processed_youtube_feed != youtube_feed:
|
||||||
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to YouTube feed" % (self.feed.log_title[:30]))
|
logging.debug(
|
||||||
|
" ---> [%-30s] ~FGApplied encoding correction to YouTube feed"
|
||||||
|
% (self.feed.log_title[:30])
|
||||||
|
)
|
||||||
self.fpf = feedparser.parse(processed_youtube_feed, sanitize_html=False)
|
self.fpf = feedparser.parse(processed_youtube_feed, sanitize_html=False)
|
||||||
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
|
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
|
||||||
twitter_feed = self.fetch_twitter(address)
|
twitter_feed = self.fetch_twitter(address)
|
||||||
|
@ -236,7 +239,10 @@ class FetchFeed:
|
||||||
# Apply encoding preprocessing to special feed content
|
# Apply encoding preprocessing to special feed content
|
||||||
processed_twitter_feed = preprocess_feed_encoding(twitter_feed)
|
processed_twitter_feed = preprocess_feed_encoding(twitter_feed)
|
||||||
if processed_twitter_feed != twitter_feed:
|
if processed_twitter_feed != twitter_feed:
|
||||||
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Twitter feed" % (self.feed.log_title[:30]))
|
logging.debug(
|
||||||
|
" ---> [%-30s] ~FGApplied encoding correction to Twitter feed"
|
||||||
|
% (self.feed.log_title[:30])
|
||||||
|
)
|
||||||
self.fpf = feedparser.parse(processed_twitter_feed)
|
self.fpf = feedparser.parse(processed_twitter_feed)
|
||||||
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
|
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
|
||||||
facebook_feed = self.fetch_facebook()
|
facebook_feed = self.fetch_facebook()
|
||||||
|
@ -248,7 +254,10 @@ class FetchFeed:
|
||||||
# Apply encoding preprocessing to special feed content
|
# Apply encoding preprocessing to special feed content
|
||||||
processed_facebook_feed = preprocess_feed_encoding(facebook_feed)
|
processed_facebook_feed = preprocess_feed_encoding(facebook_feed)
|
||||||
if processed_facebook_feed != facebook_feed:
|
if processed_facebook_feed != facebook_feed:
|
||||||
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Facebook feed" % (self.feed.log_title[:30]))
|
logging.debug(
|
||||||
|
" ---> [%-30s] ~FGApplied encoding correction to Facebook feed"
|
||||||
|
% (self.feed.log_title[:30])
|
||||||
|
)
|
||||||
self.fpf = feedparser.parse(processed_facebook_feed)
|
self.fpf = feedparser.parse(processed_facebook_feed)
|
||||||
elif self.feed.is_forbidden:
|
elif self.feed.is_forbidden:
|
||||||
forbidden_feed = self.fetch_forbidden()
|
forbidden_feed = self.fetch_forbidden()
|
||||||
|
@ -260,7 +269,10 @@ class FetchFeed:
|
||||||
# Apply encoding preprocessing to special feed content
|
# Apply encoding preprocessing to special feed content
|
||||||
processed_forbidden_feed = preprocess_feed_encoding(forbidden_feed)
|
processed_forbidden_feed = preprocess_feed_encoding(forbidden_feed)
|
||||||
if processed_forbidden_feed != forbidden_feed:
|
if processed_forbidden_feed != forbidden_feed:
|
||||||
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to forbidden feed" % (self.feed.log_title[:30]))
|
logging.debug(
|
||||||
|
" ---> [%-30s] ~FGApplied encoding correction to forbidden feed"
|
||||||
|
% (self.feed.log_title[:30])
|
||||||
|
)
|
||||||
self.fpf = feedparser.parse(processed_forbidden_feed)
|
self.fpf = feedparser.parse(processed_forbidden_feed)
|
||||||
|
|
||||||
if not self.fpf and "json" in address:
|
if not self.fpf and "json" in address:
|
||||||
|
@ -336,7 +348,10 @@ class FetchFeed:
|
||||||
# Apply encoding preprocessing to JSON feed content
|
# Apply encoding preprocessing to JSON feed content
|
||||||
processed_json_feed = preprocess_feed_encoding(json_feed)
|
processed_json_feed = preprocess_feed_encoding(json_feed)
|
||||||
if processed_json_feed != json_feed:
|
if processed_json_feed != json_feed:
|
||||||
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to JSON feed" % (self.feed.log_title[:30]))
|
logging.debug(
|
||||||
|
" ---> [%-30s] ~FGApplied encoding correction to JSON feed"
|
||||||
|
% (self.feed.log_title[:30])
|
||||||
|
)
|
||||||
self.fpf = feedparser.parse(processed_json_feed)
|
self.fpf = feedparser.parse(processed_json_feed)
|
||||||
elif raw_feed.content and raw_feed.status_code < 400:
|
elif raw_feed.content and raw_feed.status_code < 400:
|
||||||
response_headers = raw_feed.headers
|
response_headers = raw_feed.headers
|
||||||
|
@ -345,7 +360,10 @@ class FetchFeed:
|
||||||
# Preprocess feed to fix encoding issues before parsing with feedparser
|
# Preprocess feed to fix encoding issues before parsing with feedparser
|
||||||
processed_feed = preprocess_feed_encoding(self.raw_feed)
|
processed_feed = preprocess_feed_encoding(self.raw_feed)
|
||||||
if processed_feed != self.raw_feed:
|
if processed_feed != self.raw_feed:
|
||||||
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to feed with misencoded HTML entities" % (self.feed.log_title[:30]))
|
logging.debug(
|
||||||
|
" ---> [%-30s] ~FGApplied encoding correction to feed with misencoded HTML entities"
|
||||||
|
% (self.feed.log_title[:30])
|
||||||
|
)
|
||||||
self.fpf = feedparser.parse(processed_feed, response_headers=response_headers)
|
self.fpf = feedparser.parse(processed_feed, response_headers=response_headers)
|
||||||
if self.options["verbose"]:
|
if self.options["verbose"]:
|
||||||
logging.debug(
|
logging.debug(
|
||||||
|
@ -701,7 +719,7 @@ class ProcessFeed:
|
||||||
" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
|
" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
|
||||||
% (self.feed.log_title[:30], self.fpf.status)
|
% (self.feed.log_title[:30], self.fpf.status)
|
||||||
)
|
)
|
||||||
if self.fpf.status in 403 and not self.feed.is_forbidden:
|
if self.fpf.status in [403] and not self.feed.is_forbidden:
|
||||||
self.feed = self.feed.set_is_forbidden()
|
self.feed = self.feed.set_is_forbidden()
|
||||||
fixed_feed = None
|
fixed_feed = None
|
||||||
if not self.feed.known_good:
|
if not self.feed.known_good:
|
||||||
|
|
Loading…
Add table
Reference in a new issue