NewsBlur/apps/rss_feeds/icon_importer.py

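"""Favicon importer for NewsBlur feeds.

Fetches a feed's icon from its page markup (falling back to /favicon.ico),
normalizes it to RGBA, measures its dominant color with k-means clustering,
and stores the result on the MFeedIcon document and, when configured, S3.
"""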

import base64
import datetime
import gzip
import http.client
import operator
import struct
import urllib.error
import urllib.parse
import urllib.request
from io import BytesIO
from socket import error as SocketError
import boto3
import lxml.html
import numpy
import requests
import scipy
import scipy.cluster
from django.conf import settings
from django.contrib.sites.models import Site
from django.http import HttpResponse
from OpenSSL.SSL import Error as OpenSSLError
from PIL import BmpImagePlugin, Image, PngImagePlugin
from pyasn1.error import PyAsn1Error
from requests.packages.urllib3.exceptions import LocationParseError

from apps.rss_feeds.models import MFeedIcon, MFeedPage
from utils import log as logging
from utils.facebook_fetcher import FacebookFetcher
from utils.feed_functions import TimeoutError, timelimit


class IconImporter(object):
    def __init__(self, feed, page_data=None, force=False):
        self.feed = feed
        self.force = force
        self.page_data = page_data
        self.feed_icon = MFeedIcon.get_feed(feed_id=self.feed.pk)
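
    # Typical usage (a sketch; callers elsewhere in NewsBlur pass in a Feed
    # model instance, which is not constructed here):
    #
    #     importer = IconImporter(feed, force=True)
    #     importer.save()
    #
    # save() returns False when no icon can be found; after a successful run
    # it returns `not feed.favicon_not_found`.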
    def save(self):
        if not self.force and self.feed.favicon_not_found:
            # print 'Not found, skipping...'
            return
        if (
            not self.force
            and not self.feed.favicon_not_found
            and self.feed_icon.icon_url
            and self.feed.s3_icon
        ):
            # print 'Found, but skipping...'
            return
        if 'facebook.com' in self.feed.feed_address:
            image, image_file, icon_url = self.fetch_facebook_image()
        else:
            image, image_file, icon_url = self.fetch_image_from_page_data()
        if not image:
            image, image_file, icon_url = self.fetch_image_from_path(force=self.force)

        if not image:
            self.feed_icon.not_found = True
            self.feed_icon.save()
            self.feed.favicon_not_found = True
            self.feed.save()
            return False

        image = self.normalize_image(image)
        try:
            color = self.determine_dominant_color_in_image(image)
        except (IndexError, ValueError, MemoryError):
            logging.debug(" ---> [%-30s] ~SN~FRFailed to measure icon" % self.feed.log_title[:30])
            return
        try:
            image_str = self.string_from_image(image)
        except TypeError:
            return

        if len(image_str) > 500000:
            image = None
        if (image and
            (self.force or
             self.feed_icon.data != image_str or
             self.feed_icon.icon_url != icon_url or
             self.feed_icon.not_found or
             (settings.BACKED_BY_AWS.get('icons_on_s3') and not self.feed.s3_icon))):
            logging.debug(" ---> [%-30s] ~SN~FBIcon difference:~FY color:%s (%s/%s) data:%s url:%s notfound:%s no-s3:%s" % (
                self.feed.log_title[:30],
                self.feed_icon.color != color, self.feed_icon.color, color,
                self.feed_icon.data != image_str,
                self.feed_icon.icon_url != icon_url,
                self.feed_icon.not_found,
                settings.BACKED_BY_AWS.get('icons_on_s3') and not self.feed.s3_icon))
            self.feed_icon.data = image_str
            self.feed_icon.icon_url = icon_url
            self.feed_icon.color = color
            self.feed_icon.not_found = False
            self.feed_icon.save()
            if settings.BACKED_BY_AWS.get('icons_on_s3'):
                self.save_to_s3(image_str)
        if self.feed.favicon_color != color:
            self.feed.favicon_color = color
            self.feed.favicon_not_found = False
            self.feed.save(update_fields=['favicon_color', 'favicon_not_found'])

        return not self.feed.favicon_not_found
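
    # save_to_s3 assumes settings.S3_CONN is a boto3 S3 service resource
    # (i.e. boto3.resource('s3')), so Object(...).put(...) uploads the decoded
    # PNG bytes with a 60-day Expires header and a public-read ACL.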
    def save_to_s3(self, image_str):
        expires = datetime.datetime.now() + datetime.timedelta(days=60)
        expires = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")
        settings.S3_CONN.Object(settings.S3_ICONS_BUCKET_NAME,
                                self.feed.s3_icons_key).put(Body=base64.b64decode(image_str),
                                                            ContentType='image/png',
                                                            Expires=expires,
                                                            ACL='public-read')

        self.feed.s3_icon = True
        self.feed.save()

    def load_icon(self, image_file, index=None):
        '''
        DEPRECATED

        Load Windows ICO image.

        See http://en.wikipedia.org/w/index.php?oldid=264332061 for file format
        description.

        Cribbed and modified from http://djangosnippets.org/snippets/1287/
        '''
        try:
            image_file.seek(0)
            header = struct.unpack('<3H', image_file.read(6))
        except Exception:
            return

        # Check magic
        if header[:2] != (0, 1):
            return

        # Collect icon directories. Each 16-byte ICONDIRENTRY is
        # (width, height, color count, reserved, planes, bit count,
        #  size in bytes, offset to image data); 0 means 256 for the
        # first three fields.
        directories = []
        for i in range(header[2]):
            directory = list(struct.unpack('<4B2H2I', image_file.read(16)))
            for j in range(3):
                if not directory[j]:
                    directory[j] = 256
            directories.append(directory)

        if index is None:
            # Select best icon: largest width/height/color count
            directory = max(directories, key=operator.itemgetter(slice(0, 3)))
        else:
            directory = directories[index]

        # Seek to the bitmap data
        image_file.seek(directory[7])
        prefix = image_file.read(16)
        image_file.seek(-16, 1)

        if PngImagePlugin._accept(prefix):
            # Windows Vista icon with PNG inside
            try:
                image = PngImagePlugin.PngImageFile(image_file)
            except IOError:
                return
        else:
            # Load XOR bitmap
            try:
                image = BmpImagePlugin.DibImageFile(image_file)
            except IOError:
                return
            if image.mode == 'RGBA':
                # Windows XP 32-bit color depth icon without AND bitmap
                pass
            else:
                # Patch up the bitmap height
                image.size = image.size[0], image.size[1] >> 1
                d, e, o, a = image.tile[0]
                image.tile[0] = d, (0, 0) + image.size, o, a

                # Calculate AND bitmap dimensions. See
                # http://en.wikipedia.org/w/index.php?oldid=264236948#Pixel_storage
                # for description
                offset = o + a[1] * image.size[1]
                stride = ((image.size[0] + 31) >> 5) << 2
                size = stride * image.size[1]

                # Load AND bitmap
                image_file.seek(offset)
                string = image_file.read(size)
                mask = Image.frombytes('1', image.size, string, 'raw',
                                       ('1;I', stride, -1))

                image = image.convert('RGBA')
                image.putalpha(mask)

        return image
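
    # fetch_image_from_page_data looks for stored page content in priority
    # order -- explicit page_data, the node page server, S3, then Mongo
    # (MFeedPage) -- and falls back to fetching the feed link live when no
    # icon URL turns up in the stored copy.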
    def fetch_image_from_page_data(self):
        image = None
        image_file = None
        content = None
        if self.page_data:
            content = self.page_data
        elif settings.BACKED_BY_AWS.get('pages_on_node'):
            domain = "node-page.service.consul:8008"
            if settings.DOCKERBUILD:
                domain = "node:8008"
            url = "http://%s/original_page/%s" % (
                domain,
                self.feed.pk,
            )
            try:
                page_response = requests.get(url)
                if page_response.status_code == 200:
                    content = page_response.content
            except requests.ConnectionError:
                pass
        elif settings.BACKED_BY_AWS.get('pages_on_s3') and self.feed.s3_page:
            key = settings.S3_CONN.Bucket(settings.S3_PAGES_BUCKET_NAME).Object(key=self.feed.s3_pages_key)
            compressed_content = key.get()["Body"].read()
            stream = BytesIO(compressed_content)
            gz = gzip.GzipFile(fileobj=stream)
            try:
                content = gz.read()
            except IOError:
                pass
        else:
            content = MFeedPage.get_data(feed_id=self.feed.pk)
        url = self._url_from_html(content)
        if not url:
            try:
                content = requests.get(self.cleaned_feed_link, timeout=10).content
                url = self._url_from_html(content)
            except (AttributeError, SocketError, requests.ConnectionError,
                    requests.models.MissingSchema, requests.sessions.InvalidSchema,
                    requests.sessions.TooManyRedirects,
                    requests.models.InvalidURL,
                    requests.models.ChunkedEncodingError,
                    requests.models.ContentDecodingError,
                    http.client.IncompleteRead,
                    requests.adapters.ReadTimeout,
                    LocationParseError, OpenSSLError, PyAsn1Error,
                    ValueError) as e:
                logging.debug(" ---> ~SN~FRFailed~FY to fetch ~FGfeed icon~FY: %s" % e)
        if url:
            image, image_file = self.get_image_from_url(url)
        return image, image_file, url

    @property
    def cleaned_feed_link(self):
        if self.feed.feed_link.startswith('http'):
            return self.feed.feed_link
        return 'http://' + self.feed.feed_link

    def fetch_image_from_path(self, path='favicon.ico', force=False):
        image = None
        url = None

        if not force:
            url = self.feed_icon.icon_url
        if not url and self.feed.feed_link and len(self.feed.feed_link) > 6:
            try:
                url = urllib.parse.urljoin(self.feed.feed_link, 'favicon.ico')
            except ValueError:
                url = None
        if not url:
            return None, None, None

        image, image_file = self.get_image_from_url(url)
        if not image:
            url = urllib.parse.urljoin(self.feed.feed_link, '/favicon.ico')
            image, image_file = self.get_image_from_url(url)
            # print 'Found: %s - %s' % (url, image)
        return image, image_file, url

    def fetch_facebook_image(self):
        facebook_fetcher = FacebookFetcher(self.feed)
        url = facebook_fetcher.favicon_url()
        image, image_file = self.get_image_from_url(url)
        if not image:
            url = urllib.parse.urljoin(self.feed.feed_link, '/favicon.ico')
            image, image_file = self.get_image_from_url(url)
            # print 'Found: %s - %s' % (url, image)
        return image, image_file, url

    def get_image_from_url(self, url):
        # print 'Requesting: %s' % url
        if not url:
            return None, None

        @timelimit(30)
        def _1(url):
            headers = {
                'User-Agent': 'NewsBlur Favicon Fetcher - %s subscriber%s - %s %s' % (
                    self.feed.num_subscribers,
                    's' if self.feed.num_subscribers != 1 else '',
                    self.feed.permalink,
                    self.feed.fake_user_agent,
                ),
                'Connection': 'close',
                'Accept': 'image/png,image/x-icon,image/*;q=0.9,*/*;q=0.8'
            }
            try:
                request = urllib.request.Request(url, headers=headers)
                icon = urllib.request.urlopen(request).read()
            except Exception:
                return None
            return icon

        try:
            icon = _1(url)
        except TimeoutError:
            return None, None

        try:
            icon_file = BytesIO(icon)
            image = Image.open(icon_file)
        except (IOError, ValueError):
            return None, None

        return image, icon_file
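
    # _url_from_html pulls the icon href out of page markup, matching, for
    # example (illustrative markup, not from a real feed):
    #
    #     <link rel="shortcut icon" href="/favicon.ico">
    #     <link rel="icon" href="https://example.com/static/favicon.png">
    #
    # Relative hrefs are resolved against the feed link.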
    def _url_from_html(self, content):
        url = None
        if not content:
            return url
        try:
            if isinstance(content, str):
                content = content.encode('utf-8')
            icon_path = lxml.html.fromstring(content).xpath(
                '//link[@rel="icon" or @rel="shortcut icon"]/@href'
            )
        except (lxml.etree.ParserError, TypeError):
            return url

        if icon_path:
            if str(icon_path[0]).startswith('http'):
                url = icon_path[0]
            else:
                url = urllib.parse.urljoin(self.feed.feed_link, icon_path[0])
        return url

    def normalize_image(self, image):
        # if image.size != (16, 16):
        #     image = image.resize((16, 16), Image.BICUBIC)
        if image.mode != 'RGBA':
            try:
                image = image.convert('RGBA')
            except IOError:
                pass

        return image
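
    # Dominant-color pass, in brief: flatten the icon to an N x 4 array of
    # RGBA pixels, k-means them into five centroids, pare away centroids close
    # to black or white, then pick the centroid that owns the most pixels.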
    def determine_dominant_color_in_image(self, image):
        NUM_CLUSTERS = 5

        # Convert image into array of values for each point.
        if image.mode == '1':
            image = image.convert('L')
        ar = numpy.array(image)
        shape = ar.shape

        # Reshape array of values to merge color bands. [[R], [G], [B], [A]] => [R, G, B, A]
        if len(shape) > 2:
            ar = ar.reshape(numpy.prod(shape[:2]), shape[2])

        # Get NUM_CLUSTERS worth of centroids.
        ar = ar.astype(float)
        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)

        # Pare centroids, removing blacks and whites and shades of really dark and really light.
        original_codes = codes
        for low, hi in [(60, 200), (35, 230), (10, 250)]:
            codes = numpy.array([code for code in codes
                                 if not ((code[0] < low and code[1] < low and code[2] < low) or
                                         (code[0] > hi and code[1] > hi and code[2] > hi))])
            if not len(codes):
                codes = original_codes
            else:
                break

        # Assign codes (vector quantization). Each vector is compared to the centroids
        # and assigned the nearest one.
        vecs, _ = scipy.cluster.vq.vq(ar, codes)

        # Count occurrences of each clustered vector.
        counts, bins = numpy.histogram(vecs, len(codes))

        # Find the most frequent color, based on the counts.
        index_max = numpy.argmax(counts)
        peak = codes.astype(int)[index_max]
        color = "{:02x}{:02x}{:02x}".format(peak[0], peak[1], peak[2])
        color = self.feed.adjust_color(color[:6], 21)

        return color
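
    # Icons are persisted as base64-encoded PNG text (MFeedIcon.data), which
    # is also what save_to_s3() decodes before uploading.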
    def string_from_image(self, image):
        output = BytesIO()
        image.save(output, 'png', quality=95)
        contents = output.getvalue()
        output.close()
        return base64.b64encode(contents).decode()