diff --git a/apps/profile/views.py b/apps/profile/views.py
index bf803066b..b1c91274e 100644
--- a/apps/profile/views.py
+++ b/apps/profile/views.py
@@ -547,7 +547,7 @@ def save_ios_receipt(request):
             transaction_identifier,
             receipt,
         )
-        mail_admins(subject, message)
+        # mail_admins(subject, message)
     else:
         logging.user(
             request,
@@ -577,7 +577,7 @@ def save_android_receipt(request):
             product_id,
             order_id,
         )
-        mail_admins(subject, message)
+        # mail_admins(subject, message)
     else:
         logging.user(
             request, "~BM~FBNot sending Android Receipt email, already paid: %s %s" % (product_id, order_id)
diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py
index 0f9696afd..787a98ed7 100644
--- a/utils/feed_fetcher.py
+++ b/utils/feed_fetcher.py
@@ -2,6 +2,7 @@ import datetime
 import multiprocessing
 import time
 import traceback
+import html
 
 import django
 
@@ -67,6 +68,67 @@ from utils.story_functions import (
 from utils.twitter_fetcher import TwitterFetcher
 from utils.youtube_fetcher import YoutubeFetcher
 
+
+def preprocess_feed_encoding(raw_xml):
+    """
+    Check if the raw XML content contains character sequences indicating that
+    UTF-8 bytes were misinterpreted as Windows-1252 (e.g., â€™ appearing where
+    a smart apostrophe was intended).
+
+    Args:
+        raw_xml (str): The raw XML content fetched from the feed
+
+    Returns:
+        str: The corrected XML content with proper encoding
+    """
+    # Common indicators of misencoded UTF-8
+    misencoding_indicators = [
+        # Common UTF-8-read-as-Windows-1252 (mojibake) patterns
+        "â€™",  # Smart apostrophe (')
+        "â€“",  # En dash (–)
+        "â€”",  # Em dash (—)
+        "â€œ",  # Opening smart quote (")
+        "â€",  # Closing smart quote (")
+        "â€˜",  # Single opening quote (')
+        "â€™",  # Single closing quote (')
+        "â€¦",  # Ellipsis (…)
+        "Â\xa0",  # Non-breaking space
+        "â€¢",  # Bullet point (•)
+        "Â®",  # Registered trademark (®)
+        "Â©",  # Copyright (©)
", # Copyright (©) + + # Additional patterns that indicate encoding issues + "é", # é misencoded + "î", # ® misencoded + "ö", # ¶ misencoded + "ò", # ² misencoded + "ð", # ° misencoded + "Ž", # ½ misencoded + ] + + # Check if any of the indicators are present + needs_fixing = any(indicator in raw_xml for indicator in misencoding_indicators) + + if needs_fixing: + try: + # Step 1: HTML Unescaping - convert HTML entities to their literal characters + # This will typically produce characters like ’ in place of the intended smart apostrophe + unescaped = html.unescape(raw_xml) + + # Step 2: Encoding Reinterpretation + # Re-encode as Latin-1/Windows-1252 and decode as UTF-8 + # This "encoding shuffle" restores the original characters + corrected = unescaped.encode('latin1').decode('utf-8', errors='replace') + + return corrected + except (UnicodeError, AttributeError) as e: + # If there's an error in the encoding correction, log it and return the original + logging.debug("Error fixing feed encoding: %s" % str(e)) + return raw_xml + + # If no indicators are found, return the original XML + return raw_xml + # from utils.feed_functions import mail_feed_error_to_admin @@ -152,7 +214,11 @@ class FetchFeed: " ***> [%-30s] ~FRYouTube fetch failed: %s." % (self.feed.log_title[:30], address) ) return FEED_ERRHTTP, None - self.fpf = feedparser.parse(youtube_feed, sanitize_html=False) + # Apply encoding preprocessing to special feed content + processed_youtube_feed = preprocess_feed_encoding(youtube_feed) + if processed_youtube_feed != youtube_feed: + logging.debug(" ---> [%-30s] ~FGApplied encoding correction to YouTube feed" % (self.feed.log_title[:30])) + self.fpf = feedparser.parse(processed_youtube_feed, sanitize_html=False) elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])): twitter_feed = self.fetch_twitter(address) if not twitter_feed: @@ -160,7 +226,11 @@ class FetchFeed: " ***> [%-30s] ~FRTwitter fetch failed: %s" % (self.feed.log_title[:30], address) ) return FEED_ERRHTTP, None - self.fpf = feedparser.parse(twitter_feed) + # Apply encoding preprocessing to special feed content + processed_twitter_feed = preprocess_feed_encoding(twitter_feed) + if processed_twitter_feed != twitter_feed: + logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Twitter feed" % (self.feed.log_title[:30])) + self.fpf = feedparser.parse(processed_twitter_feed) elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])): facebook_feed = self.fetch_facebook() if not facebook_feed: @@ -168,7 +238,11 @@ class FetchFeed: " ***> [%-30s] ~FRFacebook fetch failed: %s" % (self.feed.log_title[:30], address) ) return FEED_ERRHTTP, None - self.fpf = feedparser.parse(facebook_feed) + # Apply encoding preprocessing to special feed content + processed_facebook_feed = preprocess_feed_encoding(facebook_feed) + if processed_facebook_feed != facebook_feed: + logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Facebook feed" % (self.feed.log_title[:30])) + self.fpf = feedparser.parse(processed_facebook_feed) elif self.feed.is_forbidden: forbidden_feed = self.fetch_forbidden() if not forbidden_feed: @@ -176,7 +250,11 @@ class FetchFeed: " ***> [%-30s] ~FRForbidden feed fetch failed: %s" % (self.feed.log_title[:30], address) ) return FEED_ERRHTTP, None - self.fpf = feedparser.parse(forbidden_feed) + # Apply encoding preprocessing to special feed content + processed_forbidden_feed = preprocess_feed_encoding(forbidden_feed) + if processed_forbidden_feed != forbidden_feed: + 
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to forbidden feed" % (self.feed.log_title[:30])) + self.fpf = feedparser.parse(processed_forbidden_feed) if not self.fpf and "json" in address: try: @@ -248,12 +326,20 @@ class FetchFeed: " ***> [%-30s] ~FRJSON fetch failed: %s" % (self.feed.log_title[:30], address) ) return FEED_ERRHTTP, None - self.fpf = feedparser.parse(json_feed) + # Apply encoding preprocessing to JSON feed content + processed_json_feed = preprocess_feed_encoding(json_feed) + if processed_json_feed != json_feed: + logging.debug(" ---> [%-30s] ~FGApplied encoding correction to JSON feed" % (self.feed.log_title[:30])) + self.fpf = feedparser.parse(processed_json_feed) elif raw_feed.content and raw_feed.status_code < 400: response_headers = raw_feed.headers response_headers["Content-Location"] = raw_feed.url self.raw_feed = smart_str(raw_feed.content) - self.fpf = feedparser.parse(self.raw_feed, response_headers=response_headers) + # Preprocess feed to fix encoding issues before parsing with feedparser + processed_feed = preprocess_feed_encoding(self.raw_feed) + if processed_feed != self.raw_feed: + logging.debug(" ---> [%-30s] ~FGApplied encoding correction to feed with misencoded HTML entities" % (self.feed.log_title[:30])) + self.fpf = feedparser.parse(processed_feed, response_headers=response_headers) if self.options["verbose"]: logging.debug( " ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" @@ -273,6 +359,8 @@ class FetchFeed: if not self.fpf or self.options.get("force_fp", False): try: + # When feedparser fetches the URL itself, we cannot preprocess the content first + # We'll have to rely on feedparser's built-in handling here self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified) except ( TypeError, @@ -295,6 +383,7 @@ class FetchFeed: logging.debug( " ***> [%-30s] ~FRTurning off headers: %s" % (self.feed.log_title[:30], address) ) + # Another direct URL fetch that bypasses our preprocessing self.fpf = feedparser.parse(address, agent=self.feed.user_agent) except ( TypeError,