mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-08-31 21:41:33 +00:00
Preprocess feed content to fix Verge specific encoding issues.
This commit is contained in:
parent
16e22348a7
commit
d6f9ef24f8
2 changed files with 97 additions and 8 deletions
|
@ -547,7 +547,7 @@ def save_ios_receipt(request):
|
||||||
transaction_identifier,
|
transaction_identifier,
|
||||||
receipt,
|
receipt,
|
||||||
)
|
)
|
||||||
mail_admins(subject, message)
|
# mail_admins(subject, message)
|
||||||
else:
|
else:
|
||||||
logging.user(
|
logging.user(
|
||||||
request,
|
request,
|
||||||
|
@ -577,7 +577,7 @@ def save_android_receipt(request):
|
||||||
product_id,
|
product_id,
|
||||||
order_id,
|
order_id,
|
||||||
)
|
)
|
||||||
mail_admins(subject, message)
|
# mail_admins(subject, message)
|
||||||
else:
|
else:
|
||||||
logging.user(
|
logging.user(
|
||||||
request, "~BM~FBNot sending Android Receipt email, already paid: %s %s" % (product_id, order_id)
|
request, "~BM~FBNot sending Android Receipt email, already paid: %s %s" % (product_id, order_id)
|
||||||
|
|
|
@ -2,6 +2,7 @@ import datetime
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
|
import html
|
||||||
|
|
||||||
import django
|
import django
|
||||||
|
|
||||||
|
@ -67,6 +68,67 @@ from utils.story_functions import (
|
||||||
from utils.twitter_fetcher import TwitterFetcher
|
from utils.twitter_fetcher import TwitterFetcher
|
||||||
from utils.youtube_fetcher import YoutubeFetcher
|
from utils.youtube_fetcher import YoutubeFetcher
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_feed_encoding(raw_xml):
    """
    Detect and repair double-mis-encoded feed content.

    Some feeds (e.g. The Verge) emit HTML entities for the individual *bytes*
    of a UTF-8 sequence instead of the character itself: a smart apostrophe
    (U+2019, UTF-8 bytes E2 80 99) arrives as "&acirc;&euro;&trade;".
    Unescaping those entities yields the mojibake "\u00e2\u20ac\u2122";
    re-encoding that text as Windows-1252 recovers the original UTF-8 bytes,
    which then decode back to the intended character.

    Args:
        raw_xml (str): The raw XML content fetched from the feed.

    Returns:
        str: The corrected XML content, or the original string unchanged when
        no misencoding indicators are found or the repair fails.
    """
    # Entity sequences that betray mis-encoded UTF-8. "&acirc;&euro;" covers
    # the whole smart-punctuation range (curly quotes, en/em dashes, ellipsis,
    # bullet, trademark, ...) because the UTF-8 encoding of every U+20xx
    # character starts with bytes E2 80 -- i.e. "&acirc;&euro;" once each byte
    # has been entity-encoded via Windows-1252. The two capital-A prefixes
    # catch mis-encoded Latin-1 letters (C3 xx: &Atilde;..., e.g. e-acute) and
    # symbols (C2 xx: &Acirc;..., e.g. copyright, degree, nbsp). This prefix
    # list is a strict superset of matching each mojibake sequence one by one.
    misencoding_indicators = [
        "&acirc;&euro;",  # E2 80 xx: smart quotes, dashes, ellipsis, bullet, (tm)
        "&Atilde;",       # C3 xx: mis-encoded accented Latin-1 letters
        "&Acirc;",        # C2 xx: mis-encoded Latin-1 symbols
    ]

    # Fast path: nothing suspicious, leave the feed untouched.
    if not any(indicator in raw_xml for indicator in misencoding_indicators):
        return raw_xml

    try:
        # Step 1: HTML-unescape, turning the byte-entities into their literal
        # characters and producing mojibake like "\u00e2\u20ac\u2122" where a
        # smart apostrophe belongs.
        unescaped = html.unescape(raw_xml)

        # Step 2: the encoding shuffle. Windows-1252 -- NOT Latin-1 -- is
        # required here: mojibake characters such as the euro sign (0x80) and
        # trademark sign (0x99) have no Latin-1 mapping, so encoding with
        # 'latin1' would raise UnicodeEncodeError and skip the repair. cp1252
        # maps them back to the original UTF-8 bytes; errors="replace" keeps a
        # stray unmappable character from aborting the whole document.
        corrected = unescaped.encode("cp1252", errors="replace").decode("utf-8", errors="replace")

        return corrected
    except (UnicodeError, AttributeError) as e:
        # Repair failed; log and fall back to the unmodified content rather
        # than returning a half-converted feed.
        logging.debug("Error fixing feed encoding: %s" % str(e))
        return raw_xml
|
||||||
|
|
||||||
# from utils.feed_functions import mail_feed_error_to_admin
|
# from utils.feed_functions import mail_feed_error_to_admin
|
||||||
|
|
||||||
|
|
||||||
|
@ -152,7 +214,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRYouTube fetch failed: %s." % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRYouTube fetch failed: %s." % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(youtube_feed, sanitize_html=False)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_youtube_feed = preprocess_feed_encoding(youtube_feed)
|
||||||
|
if processed_youtube_feed != youtube_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to YouTube feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_youtube_feed, sanitize_html=False)
|
||||||
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
|
elif re.match(r"(https?)?://twitter.com/\w+/?", qurl(address, remove=["_"])):
|
||||||
twitter_feed = self.fetch_twitter(address)
|
twitter_feed = self.fetch_twitter(address)
|
||||||
if not twitter_feed:
|
if not twitter_feed:
|
||||||
|
@ -160,7 +226,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRTwitter fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRTwitter fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(twitter_feed)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_twitter_feed = preprocess_feed_encoding(twitter_feed)
|
||||||
|
if processed_twitter_feed != twitter_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Twitter feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_twitter_feed)
|
||||||
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
|
elif re.match(r"(.*?)facebook.com/\w+/?$", qurl(address, remove=["_"])):
|
||||||
facebook_feed = self.fetch_facebook()
|
facebook_feed = self.fetch_facebook()
|
||||||
if not facebook_feed:
|
if not facebook_feed:
|
||||||
|
@ -168,7 +238,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRFacebook fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRFacebook fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(facebook_feed)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_facebook_feed = preprocess_feed_encoding(facebook_feed)
|
||||||
|
if processed_facebook_feed != facebook_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to Facebook feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_facebook_feed)
|
||||||
elif self.feed.is_forbidden:
|
elif self.feed.is_forbidden:
|
||||||
forbidden_feed = self.fetch_forbidden()
|
forbidden_feed = self.fetch_forbidden()
|
||||||
if not forbidden_feed:
|
if not forbidden_feed:
|
||||||
|
@ -176,7 +250,11 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRForbidden feed fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRForbidden feed fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(forbidden_feed)
|
# Apply encoding preprocessing to special feed content
|
||||||
|
processed_forbidden_feed = preprocess_feed_encoding(forbidden_feed)
|
||||||
|
if processed_forbidden_feed != forbidden_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to forbidden feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_forbidden_feed)
|
||||||
|
|
||||||
if not self.fpf and "json" in address:
|
if not self.fpf and "json" in address:
|
||||||
try:
|
try:
|
||||||
|
@ -248,12 +326,20 @@ class FetchFeed:
|
||||||
" ***> [%-30s] ~FRJSON fetch failed: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRJSON fetch failed: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
return FEED_ERRHTTP, None
|
return FEED_ERRHTTP, None
|
||||||
self.fpf = feedparser.parse(json_feed)
|
# Apply encoding preprocessing to JSON feed content
|
||||||
|
processed_json_feed = preprocess_feed_encoding(json_feed)
|
||||||
|
if processed_json_feed != json_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to JSON feed" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_json_feed)
|
||||||
elif raw_feed.content and raw_feed.status_code < 400:
|
elif raw_feed.content and raw_feed.status_code < 400:
|
||||||
response_headers = raw_feed.headers
|
response_headers = raw_feed.headers
|
||||||
response_headers["Content-Location"] = raw_feed.url
|
response_headers["Content-Location"] = raw_feed.url
|
||||||
self.raw_feed = smart_str(raw_feed.content)
|
self.raw_feed = smart_str(raw_feed.content)
|
||||||
self.fpf = feedparser.parse(self.raw_feed, response_headers=response_headers)
|
# Preprocess feed to fix encoding issues before parsing with feedparser
|
||||||
|
processed_feed = preprocess_feed_encoding(self.raw_feed)
|
||||||
|
if processed_feed != self.raw_feed:
|
||||||
|
logging.debug(" ---> [%-30s] ~FGApplied encoding correction to feed with misencoded HTML entities" % (self.feed.log_title[:30]))
|
||||||
|
self.fpf = feedparser.parse(processed_feed, response_headers=response_headers)
|
||||||
if self.options["verbose"]:
|
if self.options["verbose"]:
|
||||||
logging.debug(
|
logging.debug(
|
||||||
" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
|
" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
|
||||||
|
@ -273,6 +359,8 @@ class FetchFeed:
|
||||||
|
|
||||||
if not self.fpf or self.options.get("force_fp", False):
|
if not self.fpf or self.options.get("force_fp", False):
|
||||||
try:
|
try:
|
||||||
|
# When feedparser fetches the URL itself, we cannot preprocess the content first
|
||||||
|
# We'll have to rely on feedparser's built-in handling here
|
||||||
self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified)
|
self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified)
|
||||||
except (
|
except (
|
||||||
TypeError,
|
TypeError,
|
||||||
|
@ -295,6 +383,7 @@ class FetchFeed:
|
||||||
logging.debug(
|
logging.debug(
|
||||||
" ***> [%-30s] ~FRTurning off headers: %s" % (self.feed.log_title[:30], address)
|
" ***> [%-30s] ~FRTurning off headers: %s" % (self.feed.log_title[:30], address)
|
||||||
)
|
)
|
||||||
|
# Another direct URL fetch that bypasses our preprocessing
|
||||||
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
|
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
|
||||||
except (
|
except (
|
||||||
TypeError,
|
TypeError,
|
||||||
|
|
Loading…
Add table
Reference in a new issue