Fixing page and text importer to correctly handle non-breaking spaces.

2025-09-18 21:50:56 +00:00 · 2016-12-05 17:40:39 -08:00 · 2016-12-05 17:40:39 -08:00 · 3ed96e338c
commit 3ed96e338c
parent 09570b1ab6
2 changed files with 17 additions and 1 deletions
--- a/apps/rss_feeds/page_importer.py
+++ b/apps/rss_feeds/page_importer.py
@@ -197,6 +197,8 @@ class PageImporter(object):
                pass

        if data:
+            data = data.replace("\xc2\xa0", " ") # Non-breaking space, is mangled when encoding is not utf-8
+            data = data.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8
            html = self.rewrite_page(data)
            self.save_story(html)
        
--- a/apps/rss_feeds/text_importer.py
+++ b/apps/rss_feeds/text_importer.py
@@ -56,7 +56,21 @@ class TextImporter:
        if not resp:
            return

-        text = resp.text
+        try:
+            text = resp.text
+        except (LookupError, TypeError):
+            text = resp.content
+
+        if resp.encoding and resp.encoding != 'utf-8':
+            try:
+                text = text.encode(resp.encoding)
+            except LookupError:
+                pass
+
+        if text:
+            text = text.replace("\xc2\xa0", " ") # Non-breaking space, is mangled when encoding is not utf-8
+            text = text.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8
+
        original_text_doc = readability.Document(text, url=resp.url,
                                                 debug=self.debug,
                                                 positive_keywords=["postContent", "postField"])