Updating readability.

commit a0bf83c113
parent eee1341cd3

2 changed files with 54 additions and 25 deletions
vendor/readability/htmls.py (vendored, 15 changes)
@@ -3,20 +3,19 @@ from encoding import get_encoding
 from lxml.html import tostring
 import logging
 import lxml.html
-import re
+import re, sys
 
-logging.getLogger().setLevel(logging.DEBUG)
-
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
     if isinstance(page, unicode):
+        enc = None
         page_unicode = page
     else:
-        enc = get_encoding(page)
+        enc = get_encoding(page) or 'utf-8'
         page_unicode = page.decode(enc, 'replace')
     doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc
+    return doc, enc
 
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
@@ -57,7 +56,7 @@ def add_match(collection, text, orig):
 
 def shorten_title(doc):
     title = doc.find('.//title')
-    if title is None or len(title.text) == 0:
+    if title is None or title.text is None or len(title.text) == 0:
         return ''
 
     title = orig = norm_title(title.text)
@@ -111,5 +110,5 @@ def get_body(doc):
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
     except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
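With this change build_doc returns a (doc, enc) pair instead of a bare document: enc is None for unicode input, otherwise the detected encoding with a 'utf-8' fallback. A minimal sketch of the new contract (Python 2, matching the vendored code; the sample markup and import path are illustrative):

    # Hypothetical caller showing how the return value must now be unpacked.
    from vendor.readability.htmls import build_doc

    doc, enc = build_doc('<html><body><p>hello</p></body></html>')
    print enc    # detected encoding of the byte string, e.g. 'utf-8'

    doc, enc = build_doc(u'<html><body><p>hello</p></body></html>')
    print enc    # None: unicode input needs no decoding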
vendor/readability/readability.py (vendored, 64 changes)
@@ -76,13 +76,23 @@ def clean(text):
 def text_length(i):
     return len(clean(i.text_content() or ""))
 
+regexp_type = type(re.compile('hello, world'))
+
+def compile_pattern(elements):
+    if not elements:
+        return None
+    if isinstance(elements, regexp_type):
+        return elements
+    if isinstance(elements, basestring):
+        elements = elements.split(',')
+    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
     TEXT_LENGTH_THRESHOLD = 25
     RETRY_LENGTH = 250
 
-    def __init__(self, input, **options):
+    def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
         """Generate the document
 
         :param input: string of the html content.
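The new compile_pattern helper accepts None, an already-compiled regexp, a comma-separated string, or a list of strings, and normalizes all of them into a single regexp of escaped, lowercased literals, which class_weight later uses via search() and match(). Roughly, assuming the vendored import path:

    import re
    from vendor.readability.readability import compile_pattern

    assert compile_pattern(None) is None    # feature disabled

    pat = re.compile('news|article')
    assert compile_pattern(pat) is pat      # precompiled regexps pass through

    p = compile_pattern('News-Item,Block')  # string form is split on commas
    print p.pattern                         # news\-item|block (keywords lowercased, then re.escape()d)
    assert p.search('div news-item left')   # note: only the keywords are lowercased, not the text searched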
@@ -93,11 +103,16 @@ class Document:
         - min_text_length:
         - retry_length:
         - url: will allow adjusting links to be absolute
+        - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
+        - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+        Also positive_keywords and negative_keywords could be a regexp.
         """
         self.input = input
         self.options = options
         self.html = None
+        self.encoding = None
+        self.positive_keywords = compile_pattern(positive_keywords)
+        self.negative_keywords = compile_pattern(negative_keywords)
 
     def _html(self, force=False):
         if force or self.html is None:
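On the API side, the new constructor arguments let a caller bias extraction toward or away from particular class/id values without touching the built-in REGEXES. A hedged usage sketch (file name and keyword lists invented):

    from vendor.readability.readability import Document

    html = open('article.html').read()
    doc = Document(html,
                   positive_keywords=['news-item', 'block'],
                   negative_keywords=['mysidebar', 'related', 'ads'])
    print doc.short_title()
    print doc.summary().encode('utf-8', 'replace')
    print doc.encoding    # set by _parse() once the document has been built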
@@ -105,7 +120,7 @@ class Document:
         return self.html
 
     def _parse(self, input):
-        doc = build_doc(input)
+        doc, self.encoding = build_doc(input)
         doc = html_cleaner.clean_html(doc)
         base_href = self.options.get('url', None)
         if base_href:
@@ -123,6 +138,9 @@ class Document:
     def short_title(self):
         return shorten_title(self._html(True))
 
+    def get_clean_html(self):
+        return clean_attributes(tounicode(self.html))
+
     def summary(self, html_partial=False):
         """Generate the summary of the html docuemnt
 
@@ -308,19 +326,25 @@ class Document:
 
 
     def class_weight(self, e):
         weight = 0
-        if e.get('class', None):
-            if REGEXES['negativeRe'].search(e.get('class')):
-                weight -= 25
+        for feature in [e.get('class', None), e.get('id', None)]:
+            if feature:
+                if REGEXES['negativeRe'].search(feature):
+                    weight -= 25
 
-            if REGEXES['positiveRe'].search(e.get('class')):
-                weight += 25
+                if REGEXES['positiveRe'].search(feature):
+                    weight += 25
 
-        if e.get('id', None):
-            if REGEXES['negativeRe'].search(e.get('id')):
-                weight -= 25
+            if self.positive_keywords and self.positive_keywords.search(feature):
+                weight += 25
 
-            if REGEXES['positiveRe'].search(e.get('id')):
-                weight += 25
+            if self.negative_keywords and self.negative_keywords.search(feature):
+                weight -= 25
+
+        if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag):
+            weight += 25
+
+        if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag):
+            weight -= 25
 
         return weight
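class_weight now folds the old class/id branches into one loop over both attributes, layers the user-supplied patterns on top, and finishes with a synthetic 'tag-' + tag name check. A standalone sketch of just the keyword half of the scoring (patterns written out by hand to stand in for compile_pattern; note the committed loop passes feature to the keyword patterns' search() even when the attribute is missing, so the `or ''` guards below are an addition for the sketch, not part of the diff):

    import re
    import lxml.html

    positive = re.compile(u'news\\-item|block', re.U)   # compile_pattern(['news-item', 'block'])
    negative = re.compile(u'mysidebar', re.U)           # compile_pattern(['mysidebar'])

    e = lxml.html.fragment_fromstring('<div id="mysidebar" class="news-item">x</div>')
    weight = 0
    for feature in [e.get('class', None), e.get('id', None)]:
        if positive.search(feature or ''):   # class "news-item" -> +25
            weight += 25
        if negative.search(feature or ''):   # id "mysidebar" -> -25
            weight -= 25
    if positive.match('tag-' + e.tag):       # 'tag-div' hits neither pattern
        weight += 25
    if negative.match('tag-' + e.tag):
        weight -= 25
    print weight                             # 0: the two hits cancel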
@@ -530,7 +554,8 @@ class Document:
                     #el.attrib = {} #FIXME:Checkout the effects of disabling this
                     pass
 
-        return clean_attributes(tounicode(node))
+        self.html = node
+        return self.get_clean_html()
 
 
 class HashableElement():
@@ -565,6 +590,8 @@ def main():
     parser = OptionParser(usage="%prog: [options] [file]")
     parser.add_option('-v', '--verbose', action='store_true')
     parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
+    parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
+    parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
     (options, args) = parser.parse_args()
 
     if not (len(args) == 1 or options.url):
@@ -577,13 +604,16 @@ def main():
         file = urllib.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8'
+    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
         print Document(file.read(),
               debug=options.verbose,
-              url=options.url).summary().encode(enc, 'replace')
+              url=options.url,
+              positive_keywords = options.positive_keywords,
+              negative_keywords = options.negative_keywords,
+              ).summary().encode(enc, 'replace')
     finally:
         file.close()
 
 if __name__ == '__main__':
     main()
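With the two new flags wired into main(), comma-separated strings flow straight into the constructor and exercise the basestring branch of compile_pattern. Condensed, what the new command line does (URL invented for illustration):

    # $ python readability.py -u http://example.com/story.html \
    #       -p news-item,block -n mysidebar,related,ads
    # ...is roughly equivalent to:
    import urllib
    from vendor.readability.readability import Document

    html = urllib.urlopen('http://example.com/story.html').read()
    print Document(html,
                   positive_keywords='news-item,block',
                   negative_keywords='mysidebar,related,ads',
                   ).summary().encode('utf-8', 'replace')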