Fixing page and text importers to correctly handle non-breaking spaces.

This commit is contained in:
Samuel Clay 2016-12-05 17:40:39 -08:00
parent 09570b1ab6
commit 3ed96e338c
2 changed files with 17 additions and 1 deletions

View file

@@ -197,6 +197,8 @@ class PageImporter(object):
pass
if data:
data = data.replace("\xc2\xa0", " ") # Non-breaking space, is mangled when encoding is not utf-8
data = data.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8
html = self.rewrite_page(data)
self.save_story(html)

View file

@@ -56,7 +56,21 @@ class TextImporter:
if not resp:
return
text = resp.text
try:
text = resp.text
except (LookupError, TypeError):
text = resp.content
if resp.encoding and resp.encoding != 'utf-8':
try:
text = text.encode(resp.encoding)
except LookupError:
pass
if text:
text = text.replace("\xc2\xa0", " ") # Non-breaking space, is mangled when encoding is not utf-8
text = text.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8
original_text_doc = readability.Document(text, url=resp.url,
debug=self.debug,
positive_keywords=["postContent", "postField"])