mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Fixing page and text importers to correctly handle non-breaking spaces.
This commit is contained in:
parent
09570b1ab6
commit
3ed96e338c
2 changed files with 17 additions and 1 deletions
|
@ -197,6 +197,8 @@ class PageImporter(object):
|
|||
pass
|
||||
|
||||
if data:
|
||||
data = data.replace("\xc2\xa0", " ") # Non-breaking space, is mangled when encoding is not utf-8
|
||||
data = data.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8
|
||||
html = self.rewrite_page(data)
|
||||
self.save_story(html)
|
||||
|
||||
|
|
|
@ -56,7 +56,21 @@ class TextImporter:
|
|||
if not resp:
|
||||
return
|
||||
|
||||
text = resp.text
|
||||
try:
|
||||
text = resp.text
|
||||
except (LookupError, TypeError):
|
||||
text = resp.content
|
||||
|
||||
if resp.encoding and resp.encoding != 'utf-8':
|
||||
try:
|
||||
text = text.encode(resp.encoding)
|
||||
except LookupError:
|
||||
pass
|
||||
|
||||
if text:
|
||||
text = text.replace("\xc2\xa0", " ") # Non-breaking space, is mangled when encoding is not utf-8
|
||||
text = text.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8
|
||||
|
||||
original_text_doc = readability.Document(text, url=resp.url,
|
||||
debug=self.debug,
|
||||
positive_keywords=["postContent", "postField"])
|
||||
|
|
Loading…
Add table
Reference in a new issue