mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-08-31 21:41:33 +00:00
Preprocess feed content to fix Verge specific encoding issues.
This commit is contained in:
parent
16e22348a7
commit
d6f9ef24f8
2 changed files with 97 additions and 8 deletions
|
@ -547,7 +547,7 @@ def save_ios_receipt(request):
|
||||||
transaction_identifier,
|
transaction_identifier,
|
||||||
receipt,
|
receipt,
|
||||||
)
|
)
|
||||||
mail_admins(subject, message)
|
# mail_admins(subject, message)
|
||||||
else:
|
else:
|
||||||
logging.user(
|
logging.user(
|
||||||
request,
|
request,
|
||||||
|
@ -577,7 +577,7 @@ def save_android_receipt(request):
|
||||||
product_id,
|
product_id,
|
||||||
order_id,
|
order_id,
|
||||||
)
|
)
|
||||||
mail_admins(subject, message)
|
# mail_admins(subject, message)
|
||||||
else:
|
else:
|
||||||
logging.user(
|
logging.user(
|
||||||
request, "~BM~FBNot sending Android Receipt email, already paid: %s %s" % (product_id, order_id)
|
request, "~BM~FBNot sending Android Receipt email, already paid: %s %s" % (product_id, order_id)
|
||||||
|
|
|
@ -2,6 +2,7 @@ import datetime
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
|
import html
|
||||||
|
|
||||||
import django
|
import django
|
||||||
|
|
||||||
|
@ -67,6 +68,67 @@ from utils.story_functions import (
|
||||||
from utils.twitter_fetcher import TwitterFetcher
|
from utils.twitter_fetcher import TwitterFetcher
|
||||||
from utils.youtube_fetcher import YoutubeFetcher
|
from utils.youtube_fetcher import YoutubeFetcher
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_feed_encoding(raw_xml):
    """
    Detect and repair double-mis-encoded feed content.

    Some feeds (e.g. The Verge) emit HTML entities for the individual *bytes*
    of a UTF-8 sequence instead of the character itself: a smart apostrophe
    (U+2019, UTF-8 bytes E2 80 99) arrives as "&acirc;&euro;&trade;".
    Unescaping those entities yields the mojibake "\u00e2\u20ac\u2122";
    re-encoding that text as Windows-1252 recovers the original UTF-8 bytes,
    which then decode back to the intended character.

    Args:
        raw_xml (str): The raw XML content fetched from the feed.

    Returns:
        str: The corrected XML content, or the original string unchanged when
        no misencoding indicators are found or the repair fails.
    """
    # Entity sequences that betray mis-encoded UTF-8. "&acirc;&euro;" covers
    # the whole smart-punctuation range (curly quotes, en/em dashes, ellipsis,
    # bullet, trademark, ...) because the UTF-8 encoding of every U+20xx
    # character starts with bytes E2 80 -- i.e. "&acirc;&euro;" once each byte
    # has been entity-encoded via Windows-1252. The two capital-A prefixes
    # catch mis-encoded Latin-1 letters (C3 xx: &Atilde;..., e.g. e-acute) and
    # symbols (C2 xx: &Acirc;..., e.g. copyright, degree, nbsp). This prefix
    # list is a strict superset of matching each mojibake sequence one by one.
    misencoding_indicators = [
        "&acirc;&euro;",  # E2 80 xx: smart quotes, dashes, ellipsis, bullet, (tm)
        "&Atilde;",       # C3 xx: mis-encoded accented Latin-1 letters
        "&Acirc;",        # C2 xx: mis-encoded Latin-1 symbols
    ]

    # Fast path: nothing suspicious, leave the feed untouched.
    if not any(indicator in raw_xml for indicator in misencoding_indicators):
        return raw_xml

    try:
        # Step 1: HTML-unescape, turning the byte-entities into their literal
        # characters and producing mojibake like "\u00e2\u20ac\u2122" where a
        # smart apostrophe belongs.
        unescaped = html.unescape(raw_xml)

        # Step 2: the encoding shuffle. Windows-1252 -- NOT Latin-1 -- is
        # required here: mojibake characters such as the euro sign (0x80) and
        # trademark sign (0x99) have no Latin-1 mapping, so encoding with
        # 'latin1' would raise UnicodeEncodeError and skip the repair. cp1252
        # maps them back to the original UTF-8 bytes; errors="replace" keeps a
        # stray unmappable character from aborting the whole document.
        corrected = unescaped.encode("cp1252", errors="replace").decode("utf-8", errors="replace")

        return corrected
    except (UnicodeError, AttributeError) as e:
        # Repair failed; log and fall back to the unmodified content rather
        # than returning a half-converted feed.
        logging.debug("Error fixing feed encoding: %s" % str(e))
        return raw_xml
|
||||||
|
|
||||||
# from utils.feed_functions import mail_feed_error_to_admin
|
# from utils.feed_functions import mail_feed_error_to_admin
|
||||||
|
|
||||||
|
|
||||||
|
@ -152,7 +214,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRYouTube fetch failed: %s." % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRYouTube fetch failed: %s." % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(youtube_feed, sanitize_html=False)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_youtube_feed = preprocess_feed_encoding(youtube_feed)
|
||||||
|
if processed_youtube_feed != youtube_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to YouTube feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_youtube_feed, sanitize_html=False)
|
||||||
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
|
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
|
||||||
twitter_feed = self.fetch_twitter(address)
|
twitter_feed = self.fetch_twitter(address)
|
||||||
if not twitter_feed:
|
if not twitter_feed:
|
||||||
|
@ -160,7 +226,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRTwitter fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRTwitter fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(twitter_feed)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_twitter_feed = preprocess_feed_encoding(twitter_feed)
|
||||||
|
if processed_twitter_feed != twitter_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Twitter feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_twitter_feed)
|
||||||
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
|
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
|
||||||
facebook_feed = self.fetch_facebook()
|
facebook_feed = self.fetch_facebook()
|
||||||
if not facebook_feed:
|
if not facebook_feed:
|
||||||
|
@ -168,7 +238,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRFacebook fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRFacebook fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(facebook_feed)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_facebook_feed = preprocess_feed_encoding(facebook_feed)
|
||||||
|
if processed_facebook_feed != facebook_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Facebook feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_facebook_feed)
|
||||||
elif self.feed.is_forbidden:
|
elif self.feed.is_forbidden:
|
||||||
forbidden_feed = self.fetch_forbidden()
|
forbidden_feed = self.fetch_forbidden()
|
||||||
if not forbidden_feed:
|
if not forbidden_feed:
|
||||||
|
@ -176,7 +250,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRForbidden feed fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRForbidden feed fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(forbidden_feed)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_forbidden_feed = preprocess_feed_encoding(forbidden_feed)
|
||||||
|
if processed_forbidden_feed != forbidden_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to forbidden feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_forbidden_feed)
|
||||||
|
|
||||||
if not self.fpf and "json" in address:
|
if not self.fpf and "json" in address:
|
||||||
try:
|
try:
|
||||||
|
@ -248,12 +326,20 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRJSON fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRJSON fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(json_feed)
|
# Apply encoding preprocessing to JSON feed content
|
||||||
|
processed_json_feed = preprocess_feed_encoding(json_feed)
|
||||||
|
if processed_json_feed != json_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to JSON feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_json_feed)
|
||||||
elif raw_feed.content and raw_feed.status_code < 400:
|
elif raw_feed.content and raw_feed.status_code < 400:
|
||||||
response_headers = raw_feed.headers
|
response_headers = raw_feed.headers
|
||||||
response_headers["Content-Location"] = raw_feed.url
|
response_headers["Content-Location"] = raw_feed.url
|
||||||
self.raw_feed = smart_str(raw_feed.content)
|
self.raw_feed = smart_str(raw_feed.content)
|
||||||
self.fpf = feedparser.parse(self.raw_feed, response_headers=response_headers)
|
# Preprocess feed to fix encoding issues before parsing with feedparser
|
||||||
|
processed_feed = preprocess_feed_encoding(self.raw_feed)
|
||||||
|
if processed_feed != self.raw_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to feed with misencoded HTML entities" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_feed, response_headers=response_headers)
|
||||||
if self.options["verbose"]:
|
if self.options["verbose"]:
|
||||||
logging.debug(
|
logging.debug(
|
||||||
" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
|
" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
|
||||||
|
@ -273,6 +359,8 @@ class FetchFeed:
|
||||||
|
|
||||||
if not self.fpf or self.options.get("force_fp", False):
|
if not self.fpf or self.options.get("force_fp", False):
|
||||||
try:
|
try:
|
||||||
|
# When feedparser fetches the URL itself, we cannot preprocess the content first
|
||||||
|
# We'll have to rely on feedparser's built-in handling here
|
||||||
self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified)
|
self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified)
|
||||||
except (
|
except (
|
||||||
TypeError,
|
TypeError,
|
||||||
|
@ -295,6 +383,7 @@ class FetchFeed:
|
||||||
logging.debug(
|
logging.debug(
|
||||||
" ***> [%-30s] ~FRTurning off headers: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRTurning off headers: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
|
# Another direct URL fetch that bypasses our preprocessing
|
||||||
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
|
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
|
||||||
except (
|
except (
|
||||||
TypeError,
|
TypeError,
|
||||||
|
|
Loading…
Add table
Reference in a new issue