Updating readability.

Samuel Clay 2014-01-14 16:58:21 -08:00
parent eee1341cd3
commit a0bf83c113
2 changed files with 54 additions and 25 deletions

@@ -3,20 +3,19 @@ from encoding import get_encoding
 from lxml.html import tostring
 import logging
 import lxml.html
-import re
+import re, sys

-logging.getLogger().setLevel(logging.DEBUG)
-
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

 def build_doc(page):
     if isinstance(page, unicode):
+        enc = None
         page_unicode = page
     else:
-        enc = get_encoding(page)
+        enc = get_encoding(page) or 'utf-8'
         page_unicode = page.decode(enc, 'replace')
     doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc
+    return doc, enc

 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
@@ -57,7 +56,7 @@ def add_match(collection, text, orig):
 def shorten_title(doc):
     title = doc.find('.//title')
-    if title is None or len(title.text) == 0:
+    if title is None or title.text is None or len(title.text) == 0:
         return ''

     title = orig = norm_title(title.text)
@@ -111,5 +110,5 @@ def get_body(doc):
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
     except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
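
Net effect of the hunks above: build_doc() now hands back the encoding it detected alongside the parsed document (None when the input was already unicode), with a utf-8 fallback when detection fails, so every caller has to unpack a pair. A minimal sketch of the new contract (Python 2, matching the diff; the sample markup is invented):

    # build_doc() now returns a (document, encoding) pair.
    doc, enc = build_doc('<html><head><title>Hi</title></head><body><p>Text</p></body></html>')
    print doc.findtext('.//title')   # 'Hi'
    print enc                        # whatever get_encoding() found, or the 'utf-8' fallback

    doc, enc = build_doc(u'<html><body><p>Text</p></body></html>')
    print enc                        # None: unicode input skips detection entirely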

@@ -76,13 +76,23 @@ def clean(text):
 def text_length(i):
     return len(clean(i.text_content() or ""))

+regexp_type = type(re.compile('hello, world'))
+
+def compile_pattern(elements):
+    if not elements:
+        return None
+    if isinstance(elements, regexp_type):
+        return elements
+    if isinstance(elements, basestring):
+        elements = elements.split(',')
+    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)

 class Document:
     """Class to build a etree document out of html."""

     TEXT_LENGTH_THRESHOLD = 25
     RETRY_LENGTH = 250

-    def __init__(self, input, **options):
+    def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
         """Generate the document

         :param input: string of the html content.
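
The compile_pattern() helper added above is what lets the new keyword arguments arrive in any of three shapes: it passes None and pre-compiled regexps through, splits a comma-separated string, and folds a list into one lowercased, escaped alternation. A quick sketch of the accepted inputs (the import path assumes the usual readability package layout):

    import re
    from readability.readability import compile_pattern

    print compile_pattern(None)                     # None: no filtering requested
    print compile_pattern('foo,bar').pattern        # u'foo|bar' (string split on commas)
    print compile_pattern(['Foo', 'B+ar']).pattern  # u'foo|b\+ar' (lowercased and escaped)
    print compile_pattern(re.compile('x')).pattern  # 'x' (compiled patterns pass through)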
@@ -93,11 +103,16 @@ class Document:
         - min_text_length:
         - retry_length:
         - url: will allow adjusting links to be absolute
+        - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
+        - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+        Also positive_keywords and negative_keywords could be a regexp.
         """
         self.input = input
         self.options = options
         self.html = None
+        self.encoding = None
+        self.positive_keywords = compile_pattern(positive_keywords)
+        self.negative_keywords = compile_pattern(negative_keywords)

     def _html(self, force=False):
         if force or self.html is None:
@@ -105,7 +120,7 @@ class Document:
         return self.html

     def _parse(self, input):
-        doc = build_doc(input)
+        doc, self.encoding = build_doc(input)
         doc = html_cleaner.clean_html(doc)
         base_href = self.options.get('url', None)
         if base_href:
@@ -123,6 +138,9 @@ class Document:
     def short_title(self):
         return shorten_title(self._html(True))

+    def get_clean_html(self):
+        return clean_attributes(tounicode(self.html))
+
     def summary(self, html_partial=False):
         """Generate the summary of the html docuemnt
@@ -308,19 +326,25 @@ class Document:
     def class_weight(self, e):
         weight = 0
-        if e.get('class', None):
-            if REGEXES['negativeRe'].search(e.get('class')):
-                weight -= 25
-
-            if REGEXES['positiveRe'].search(e.get('class')):
-                weight += 25
-
-        if e.get('id', None):
-            if REGEXES['negativeRe'].search(e.get('id')):
-                weight -= 25
-
-            if REGEXES['positiveRe'].search(e.get('id')):
-                weight += 25
+        for feature in [e.get('class', None), e.get('id', None)]:
+            if feature:
+                if REGEXES['negativeRe'].search(feature):
+                    weight -= 25
+
+                if REGEXES['positiveRe'].search(feature):
+                    weight += 25
+
+                if self.positive_keywords and self.positive_keywords.search(feature):
+                    weight += 25
+
+                if self.negative_keywords and self.negative_keywords.search(feature):
+                    weight -= 25
+
+        if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag):
+            weight += 25
+
+        if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag):
+            weight -= 25

         return weight
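
To make the reworked arithmetic concrete: class and id are now scored through one loop, and user keywords stack on top of the built-in patterns, so a single feature can move the weight twice. A worked example under stated assumptions ('sidebar' appears in the library's built-in negativeRe; the markup and import path are invented for illustration):

    from lxml.html import fragment_fromstring
    from readability.readability import Document

    doc = Document('<html></html>', negative_keywords=['sidebar'])
    el = fragment_fromstring('<div class="sidebar">x</div>')
    # class 'sidebar': built-in negativeRe hit (-25) plus user keyword hit (-25);
    # no id, and 'tag-div' matches neither keyword pattern.
    print doc.class_weight(el)  # -50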
@@ -530,7 +554,8 @@ class Document:
                 #el.attrib = {} #FIXME:Checkout the effects of disabling this
                 pass

-        return clean_attributes(tounicode(node))
+        self.html = node
+        return self.get_clean_html()

 class HashableElement():
@@ -565,6 +590,8 @@ def main():
     parser = OptionParser(usage="%prog: [options] [file]")
     parser.add_option('-v', '--verbose', action='store_true')
     parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
+    parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
+    parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
     (options, args) = parser.parse_args()

     if not (len(args) == 1 or options.url):
@@ -577,13 +604,16 @@ def main():
         file = urllib.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8'
+    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
         print Document(file.read(),
                        debug=options.verbose,
-                       url=options.url).summary().encode(enc, 'replace')
+                       url=options.url,
+                       positive_keywords = options.positive_keywords,
+                       negative_keywords = options.negative_keywords,
+                       ).summary().encode(enc, 'replace')
     finally:
         file.close()

 if __name__ == '__main__':
     main()
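
With the two new options threaded through main(), keyword tuning works from the shell as well as the API. A hedged usage sketch (URL and script path are placeholders; the keyword values are the ones from the docstring above):

    $ python readability.py -u http://example.com/article.html \
          -p news-item,block -n mysidebar,related,ads

Since compile_pattern() comma-splits plain strings, the equivalent library call can pass the raw option values straight through:

    from readability.readability import Document
    print Document(open('article.html').read(),
                   positive_keywords='news-item,block',
                   negative_keywords='mysidebar,related,ads',
                   ).summary().encode('utf-8', 'replace')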