# NewsBlur/apps/rss_feeds/icon_importer.py

import urllib.request
import urllib.error
import urllib.parse
import struct
import operator
import gzip
import datetime
import base64
import http.client

import lxml.etree  # for lxml.etree.ParserError below
import lxml.html
import numpy
import scipy.cluster.vq
import requests
from PIL import BmpImagePlugin, PngImagePlugin, Image
from socket import error as SocketError
from boto.s3.key import Key
from io import BytesIO
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
from requests.packages.urllib3.exceptions import LocationParseError

from django.conf import settings
from apps.rss_feeds.models import MFeedPage, MFeedIcon
from utils.facebook_fetcher import FacebookFetcher
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError


class IconImporter(object):
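    """Find, normalize, and store the favicon for a feed.

    A minimal usage sketch (assumes `feed` is a Feed from
    apps.rss_feeds.models; `force=True` refreshes an existing icon):

        IconImporter(feed, force=True).save()
    """
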
    def __init__(self, feed, page_data=None, force=False):
        self.feed = feed
        self.force = force
        self.page_data = page_data
        self.feed_icon = MFeedIcon.get_feed(feed_id=self.feed.pk)
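
    # save() is the entry point: it finds an icon (in cached page data, on the
    # site itself, or via Facebook), extracts its dominant color, and stores
    # the result on the MFeedIcon document and, when configured, on S3.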
    def save(self):
        if not self.force and self.feed.favicon_not_found:
            # print('Not found, skipping...')
            return
        if (
            not self.force
            and not self.feed.favicon_not_found
            and self.feed_icon.icon_url
            and self.feed.s3_icon
        ):
            # print('Found, but skipping...')
            return

        if 'facebook.com' in self.feed.feed_address:
            image, image_file, icon_url = self.fetch_facebook_image()
        else:
            image, image_file, icon_url = self.fetch_image_from_page_data()
        if not image:
            image, image_file, icon_url = self.fetch_image_from_path(force=self.force)

        if image:
            image = self.normalize_image(image)
            try:
                color = self.determine_dominant_color_in_image(image)
            except IndexError:
                return
            except MemoryError:
                return
            try:
                image_str = self.string_from_image(image)
            except TypeError:
                return

            # Refuse to store pathologically large icons.
            if len(image_str) > 500000:
                image = None

        if (image and
            (self.force or
             self.feed_icon.data != image_str or
             self.feed_icon.icon_url != icon_url or
             self.feed_icon.not_found or
             (settings.BACKED_BY_AWS.get('icons_on_s3') and not self.feed.s3_icon))):
            logging.debug(" ---> [%-30s] ~SN~FBIcon difference:~FY color:%s (%s/%s) data:%s url:%s notfound:%s no-s3:%s" % (
                self.feed.log_title[:30],
                self.feed_icon.color != color, self.feed_icon.color, color,
                self.feed_icon.data != image_str,
                self.feed_icon.icon_url != icon_url,
                self.feed_icon.not_found,
                settings.BACKED_BY_AWS.get('icons_on_s3') and not self.feed.s3_icon))
            self.feed_icon.data = image_str
            self.feed_icon.icon_url = icon_url
            self.feed_icon.color = color
            self.feed_icon.not_found = False
            self.feed_icon.save()
            if settings.BACKED_BY_AWS.get('icons_on_s3'):
                self.save_to_s3(image_str)
            if self.feed.favicon_color != color:
                self.feed.favicon_color = color
                self.feed.favicon_not_found = False
                self.feed.save(update_fields=['favicon_color', 'favicon_not_found'])

        if not image:
            self.feed_icon.not_found = True
            self.feed_icon.save()
            self.feed.favicon_not_found = True
            self.feed.save()

        return not self.feed.favicon_not_found
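
    # Uploads the PNG bytes to the icons bucket with a 60-day Expires header
    # and a public-read ACL, then flags the feed as having an S3-hosted icon.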
    def save_to_s3(self, image_str):
        expires = datetime.datetime.now() + datetime.timedelta(days=60)
        expires = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")
        k = Key(settings.S3_CONN.get_bucket(settings.S3_ICONS_BUCKET_NAME))
        k.key = self.feed.s3_icons_key
        k.set_metadata('Content-Type', 'image/png')
        k.set_metadata('Expires', expires)
        k.set_contents_from_string(base64.b64decode(image_str))
        k.set_acl('public-read')

        self.feed.s3_icon = True
        self.feed.save()
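
    # load_icon() is a hand-rolled Windows ICO parser, marked DEPRECATED
    # below; note that modern Pillow versions reject assigning to image.size.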
    def load_icon(self, image_file, index=None):
        '''
        DEPRECATED

        Load Windows ICO image.

        See http://en.wikipedia.org/w/index.php?oldid=264332061 for file format
        description.

        Cribbed and modified from http://djangosnippets.org/snippets/1287/
        '''
        try:
            image_file.seek(0)
            header = struct.unpack('<3H', image_file.read(6))
        except Exception:
            return

        # Check magic
        if header[:2] != (0, 1):
            return

        # Collect icon directories
        directories = []
        for i in range(header[2]):
            directory = list(struct.unpack('<4B2H2I', image_file.read(16)))
            for j in range(3):
                if not directory[j]:
                    directory[j] = 256
            directories.append(directory)

        if index is None:
            # Select best icon
            directory = max(directories, key=operator.itemgetter(slice(0, 3)))
        else:
            directory = directories[index]

        # Seek to the bitmap data
        image_file.seek(directory[7])
        prefix = image_file.read(16)
        image_file.seek(-16, 1)

        if PngImagePlugin._accept(prefix):
            # Windows Vista icon with PNG inside
            try:
                image = PngImagePlugin.PngImageFile(image_file)
            except IOError:
                return
        else:
            # Load XOR bitmap
            try:
                image = BmpImagePlugin.DibImageFile(image_file)
            except IOError:
                return
            if image.mode == 'RGBA':
                # Windows XP 32-bit color depth icon without AND bitmap
                pass
            else:
                # Patch up the bitmap height
                image.size = image.size[0], image.size[1] >> 1
                d, e, o, a = image.tile[0]
                image.tile[0] = d, (0, 0) + image.size, o, a

                # Calculate AND bitmap dimensions. See
                # http://en.wikipedia.org/w/index.php?oldid=264236948#Pixel_storage
                # for description
                offset = o + a[1] * image.size[1]
                stride = ((image.size[0] + 31) >> 5) << 2
                size = stride * image.size[1]

                # Load AND bitmap
                image_file.seek(offset)
                string = image_file.read(size)
                mask = Image.frombytes('1', image.size, string, 'raw',
                                       ('1;I', stride, -1))

                image = image.convert('RGBA')
                image.putalpha(mask)

        return image
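
    # Looks for a <link rel="icon"> in the feed's cached page HTML (passed in,
    # stored on S3, or stored locally), falling back to fetching the page.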
    def fetch_image_from_page_data(self):
        image = None
        image_file = None

        if self.page_data:
            content = self.page_data
        elif settings.BACKED_BY_AWS.get('pages_on_s3') and self.feed.s3_page:
            key = settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME).get_key(self.feed.s3_pages_key)
            compressed_content = key.get_contents_as_string()
            stream = BytesIO(compressed_content)
            gz = gzip.GzipFile(fileobj=stream)
            try:
                content = gz.read()
            except IOError:
                content = None
        else:
            content = MFeedPage.get_data(feed_id=self.feed.pk)

        url = self._url_from_html(content)
        if not url:
            try:
                content = requests.get(self.cleaned_feed_link).content
                url = self._url_from_html(content)
            except (AttributeError, SocketError, requests.ConnectionError,
                    requests.models.MissingSchema, requests.sessions.InvalidSchema,
                    requests.sessions.TooManyRedirects,
                    requests.models.InvalidURL,
                    requests.models.ChunkedEncodingError,
                    requests.models.ContentDecodingError,
                    http.client.IncompleteRead,
                    LocationParseError, OpenSSLError, PyAsn1Error,
                    ValueError) as e:
                logging.debug(" ---> ~SN~FRFailed~FY to fetch ~FGfeed icon~FY: %s" % e)
        if url:
            image, image_file = self.get_image_from_url(url)
        return image, image_file, url
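
    # Ensure the feed link carries a scheme before handing it to requests.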
    @property
    def cleaned_feed_link(self):
        if self.feed.feed_link.startswith('http'):
            return self.feed.feed_link
        return 'http://' + self.feed.feed_link
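
    # Tries the previously recorded icon URL first, then `path` resolved
    # against the feed link, then /favicon.ico at the site root.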
    def fetch_image_from_path(self, path='favicon.ico', force=False):
        image = None
        url = None

        if not force:
            url = self.feed_icon.icon_url
        if not url and self.feed.feed_link and len(self.feed.feed_link) > 6:
            try:
                url = urllib.parse.urljoin(self.feed.feed_link, path)
            except ValueError:
                url = None
        if not url:
            return None, None, None

        image, image_file = self.get_image_from_url(url)
        if not image:
            url = urllib.parse.urljoin(self.feed.feed_link, '/favicon.ico')
            image, image_file = self.get_image_from_url(url)
        # print('Found: %s - %s' % (url, image))
        return image, image_file, url
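
    # For Facebook feeds, ask the FacebookFetcher for an icon URL, falling
    # back to /favicon.ico at the site root.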
    def fetch_facebook_image(self):
        facebook_fetcher = FacebookFetcher(self.feed)
        url = facebook_fetcher.favicon_url()
        image, image_file = self.get_image_from_url(url)
        if not image:
            url = urllib.parse.urljoin(self.feed.feed_link, '/favicon.ico')
            image, image_file = self.get_image_from_url(url)
        # print('Found: %s - %s' % (url, image))
        return image, image_file, url
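
    # Downloads the icon bytes under a 30-second time limit, identifying
    # NewsBlur in the User-Agent, then parses them with PIL.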
    def get_image_from_url(self, url):
        # print('Requesting: %s' % url)
        if not url:
            return None, None

        @timelimit(30)
        def _1(url):
            headers = {
                'User-Agent': 'NewsBlur Favicon Fetcher - %s subscriber%s - %s '
                              '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                              'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                              'Safari/534.48.3)' %
                              (
                                  self.feed.num_subscribers,
                                  's' if self.feed.num_subscribers != 1 else '',
                                  self.feed.permalink
                              ),
                'Connection': 'close',
                'Accept': 'image/png,image/x-icon,image/*;q=0.9,*/*;q=0.8'
            }
            try:
                request = urllib.request.Request(url, headers=headers)
                icon = urllib.request.urlopen(request).read()
            except Exception:
                return None
            return icon

        try:
            icon = _1(url)
        except TimeoutError:
            return None, None
        if not icon:
            # _1 swallowed a fetch error and returned None.
            return None, None

        try:
            icon_file = BytesIO(icon)
            image = Image.open(icon_file)
        except (IOError, ValueError):
            return None, None

        return image, icon_file
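
    # Pulls the first <link rel="icon"> or <link rel="shortcut icon"> href out
    # of the page HTML and resolves it against the feed link.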
    def _url_from_html(self, content):
        url = None
        if not content:
            return url
        try:
            if isinstance(content, str):
                content = content.encode('utf-8')
            icon_path = lxml.html.fromstring(content).xpath(
                '//link[@rel="icon" or @rel="shortcut icon"]/@href'
            )
        except (lxml.etree.ParserError, TypeError):
            return url

        if icon_path:
            if str(icon_path[0]).startswith('http'):
                url = icon_path[0]
            else:
                url = urllib.parse.urljoin(self.feed.feed_link, icon_path[0])
        return url
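
    # Favicons arrive in many modes (P, 1, L, RGB); force RGBA so the color
    # clustering below always sees consistent bands.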
    def normalize_image(self, image):
        # if image.size != (16, 16):
        #     image = image.resize((16, 16), Image.BICUBIC)
        if image.mode != 'RGBA':
            try:
                image = image.convert('RGBA')
            except IOError:
                pass

        return image
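
    # Clusters the icon's pixels with k-means (k=5) and returns the most
    # common centroid as a hex color, skipping near-black and near-white
    # centroids whenever anything else survives the filter.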
    def determine_dominant_color_in_image(self, image):
        NUM_CLUSTERS = 5

        # Convert image into array of values for each point.
        if image.mode == '1':
            image = image.convert('L')
        ar = numpy.array(image)
        shape = ar.shape

        # Reshape array of values to merge color bands. [[R], [G], [B], [A]] => [R, G, B, A]
        if len(shape) > 2:
            ar = ar.reshape(numpy.prod(shape[:2]), shape[2])

        # Get NUM_CLUSTERS worth of centroids.
        ar = ar.astype(float)
        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)

        # Pare centroids, removing blacks and whites and shades of really dark and really light.
        original_codes = codes
        for low, hi in [(60, 200), (35, 230), (10, 250)]:
            codes = numpy.array([code for code in codes
                                 if not ((code[0] < low and code[1] < low and code[2] < low) or
                                         (code[0] > hi and code[1] > hi and code[2] > hi))])
            if not len(codes):
                codes = original_codes
            else:
                break

        # Assign codes (vector quantization). Each vector is compared to the centroids
        # and assigned the nearest one.
        vecs, _ = scipy.cluster.vq.vq(ar, codes)

        # Count occurrences of each clustered vector.
        counts, bins = numpy.histogram(vecs, len(codes))

        # Uncomment to inspect the cluster mix:
        # total = counts.sum()
        # print({tuple(code[:3].astype(int)): count / total for code, count in zip(codes, counts)})

        # Find the most frequent color, based on the counts.
        index_max = numpy.argmax(counts)
        peak = codes.astype(int)[index_max]
        color = "{:02x}{:02x}{:02x}".format(peak[0], peak[1], peak[2])

        return color[:6]
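
    # Serializes the image as PNG and base64-encodes it for storage on the
    # MFeedIcon document and upload to S3.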
    def string_from_image(self, image):
        output = BytesIO()
        image.save(output, 'png', quality=95)
        contents = output.getvalue()
        output.close()
        return base64.b64encode(contents).decode()