Mirror of https://github.com/viq/NewsBlur.git, synced 2025-08-05 16:49:45 +00:00

* master: (27 commits)
  - Removing log override
  - Moving logging over to the newsblur log.
  - Fixing search indexer background task for new celery.
  - Attempting to add gunicorn errors to console/log.
  - Better handling of missing subs.
  - Handling missing user sub on feed delete.
  - Correct encoding for strings on systems that don't have utf-8 as default encoding.
  - Writing in the real urllib3 dependency for requests.
  - Upgrading requests due to urllib3 incompatibility.
  - Login required should use the next parameter.
  - Upgrading django oauth toolkit for django 1.11.
  - Handling newsletters with multiple recipients.
  - Extracting image urls sometimes fails.
  - Handling ajax errors in json views.
  - Adding timeouts to most outbound requests.
  - Sentry SDK 0.19.4.
  - Removing imperfect proxy warning for every story.
  - Found four more GET/POST crosses.
  - Feed unread count may need a POST.
  - Namespacing settings.
  - ...
393 lines · 14 KiB · Python

import urllib.request
import urllib.error
import urllib.parse
import lxml.etree
import lxml.html
import numpy
import scipy
import scipy.cluster
import struct
import operator
import gzip
import datetime
import requests
import base64
import http.client
from PIL import BmpImagePlugin, PngImagePlugin, Image
from socket import error as SocketError
from boto.s3.key import Key
from io import BytesIO
from django.conf import settings
from apps.rss_feeds.models import MFeedPage, MFeedIcon
from utils.facebook_fetcher import FacebookFetcher
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
from requests.packages.urllib3.exceptions import LocationParseError


class IconImporter(object):

    def __init__(self, feed, page_data=None, force=False):
        self.feed = feed
        self.force = force
        self.page_data = page_data
        self.feed_icon = MFeedIcon.get_feed(feed_id=self.feed.pk)

    def save(self):
        if not self.force and self.feed.favicon_not_found:
            # print 'Not found, skipping...'
            return
        if (
            not self.force
            and not self.feed.favicon_not_found
            and self.feed_icon.icon_url
            and self.feed.s3_icon
        ):
            # print 'Found, but skipping...'
            return
        if 'facebook.com' in self.feed.feed_address:
            image, image_file, icon_url = self.fetch_facebook_image()
        else:
            image, image_file, icon_url = self.fetch_image_from_page_data()
        if not image:
            image, image_file, icon_url = self.fetch_image_from_path(force=self.force)

        if image:
            image = self.normalize_image(image)
            try:
                color = self.determine_dominant_color_in_image(image)
            except (IndexError, ValueError, MemoryError):
                logging.debug(" ---> [%-30s] ~SN~FRFailed to measure icon" % self.feed.log_title[:30])
                return
            try:
                image_str = self.string_from_image(image)
            except TypeError:
                return

            if len(image_str) > 500000:
                image = None
            if (image and
                (self.force or
                 self.feed_icon.data != image_str or
                 self.feed_icon.icon_url != icon_url or
                 self.feed_icon.not_found or
                 (settings.BACKED_BY_AWS.get('icons_on_s3') and not self.feed.s3_icon))):
                logging.debug(" ---> [%-30s] ~SN~FBIcon difference:~FY color:%s (%s/%s) data:%s url:%s notfound:%s no-s3:%s" % (
                    self.feed.log_title[:30],
                    self.feed_icon.color != color, self.feed_icon.color, color,
                    self.feed_icon.data != image_str,
                    self.feed_icon.icon_url != icon_url,
                    self.feed_icon.not_found,
                    settings.BACKED_BY_AWS.get('icons_on_s3') and not self.feed.s3_icon))
                self.feed_icon.data = image_str
                self.feed_icon.icon_url = icon_url
                self.feed_icon.color = color
                self.feed_icon.not_found = False
                self.feed_icon.save()
                if settings.BACKED_BY_AWS.get('icons_on_s3'):
                    self.save_to_s3(image_str)
            if self.feed.favicon_color != color:
                self.feed.favicon_color = color
                self.feed.favicon_not_found = False
                self.feed.save(update_fields=['favicon_color', 'favicon_not_found'])

        if not image:
            self.feed_icon.not_found = True
            self.feed_icon.save()
            self.feed.favicon_not_found = True
            self.feed.save()

        return not self.feed.favicon_not_found
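
    # A minimal usage sketch (hypothetical caller; in NewsBlur this runs from
    # the feed's background icon task rather than being invoked by hand):
    #
    #   importer = IconImporter(feed, force=True)
    #   has_icon = importer.save()  # True when a favicon was found and stored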

    def save_to_s3(self, image_str):
        expires = datetime.datetime.now() + datetime.timedelta(days=60)
        expires = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")
        k = Key(settings.S3_CONN.get_bucket(settings.S3_ICONS_BUCKET_NAME))
        k.key = self.feed.s3_icons_key
        k.set_metadata('Content-Type', 'image/png')
        k.set_metadata('Expires', expires)
        k.set_contents_from_string(base64.b64decode(image_str))
        k.set_acl('public-read')

        self.feed.s3_icon = True
        self.feed.save()
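
    # Sketch of reading a stored icon back out of S3 with the same boto
    # connection (an illustrative check, not part of the production flow):
    #
    #   bucket = settings.S3_CONN.get_bucket(settings.S3_ICONS_BUCKET_NAME)
    #   png_bytes = bucket.get_key(feed.s3_icons_key).get_contents_as_string()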

    def load_icon(self, image_file, index=None):
        '''
        DEPRECATED

        Load Windows ICO image.

        See http://en.wikipedia.org/w/index.php?oldid=264332061 for file format
        description.

        Cribbed and modified from http://djangosnippets.org/snippets/1287/
        '''
        try:
            image_file.seek(0)
            header = struct.unpack('<3H', image_file.read(6))
        except Exception:
            return

        # Check magic
        if header[:2] != (0, 1):
            return

        # Collect icon directories
        directories = []
        for i in range(header[2]):
            directory = list(struct.unpack('<4B2H2I', image_file.read(16)))
            for j in range(3):
                if not directory[j]:
                    directory[j] = 256

            directories.append(directory)

        if index is None:
            # Select best icon
            directory = max(directories, key=operator.itemgetter(slice(0, 3)))
        else:
            directory = directories[index]

        # Seek to the bitmap data
        image_file.seek(directory[7])

        prefix = image_file.read(16)
        image_file.seek(-16, 1)

        if PngImagePlugin._accept(prefix):
            # Windows Vista icon with PNG inside
            try:
                image = PngImagePlugin.PngImageFile(image_file)
            except IOError:
                return
        else:
            # Load XOR bitmap
            try:
                image = BmpImagePlugin.DibImageFile(image_file)
            except IOError:
                return
            if image.mode == 'RGBA':
                # Windows XP 32-bit color depth icon without AND bitmap
                pass
            else:
                # Patch up the bitmap height
                image.size = image.size[0], image.size[1] >> 1
                d, e, o, a = image.tile[0]
                image.tile[0] = d, (0, 0) + image.size, o, a

                # Calculate AND bitmap dimensions. See
                # http://en.wikipedia.org/w/index.php?oldid=264236948#Pixel_storage
                # for description
                offset = o + a[1] * image.size[1]
                stride = ((image.size[0] + 31) >> 5) << 2
                size = stride * image.size[1]

                # Load AND bitmap
                image_file.seek(offset)
                string = image_file.read(size)
                mask = Image.frombytes('1', image.size, string, 'raw',
                                       ('1;I', stride, -1))

                image = image.convert('RGBA')
                image.putalpha(mask)

        return image
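
    # For reference, the structs unpacked above follow the ICO layout described
    # in the Wikipedia article cited in the docstring: the 6-byte '<3H' header
    # is (reserved, type, count), with type 1 for icons, and each 16-byte
    # '<4B2H2I' directory entry is (width, height, color count, reserved,
    # planes, bit depth, data size, data offset). A stored 0 in the first three
    # bytes means 256, hence the directory[j] = 256 patch-up, and directory[7]
    # is the data offset used by the image_file.seek() above.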

    def fetch_image_from_page_data(self):
        image = None
        image_file = None
        if self.page_data:
            content = self.page_data
        elif settings.BACKED_BY_AWS.get('pages_on_s3') and self.feed.s3_page:
            key = settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME).get_key(self.feed.s3_pages_key)
            compressed_content = key.get_contents_as_string()
            stream = BytesIO(compressed_content)
            gz = gzip.GzipFile(fileobj=stream)
            try:
                content = gz.read()
            except IOError:
                content = None
        else:
            content = MFeedPage.get_data(feed_id=self.feed.pk)
        url = self._url_from_html(content)
        if not url:
            try:
                content = requests.get(self.cleaned_feed_link, timeout=10).content
                url = self._url_from_html(content)
            except (AttributeError, SocketError, requests.ConnectionError,
                    requests.models.MissingSchema, requests.sessions.InvalidSchema,
                    requests.sessions.TooManyRedirects,
                    requests.models.InvalidURL,
                    requests.models.ChunkedEncodingError,
                    requests.models.ContentDecodingError,
                    http.client.IncompleteRead,
                    requests.adapters.ReadTimeout,
                    LocationParseError, OpenSSLError, PyAsn1Error,
                    ValueError) as e:
                logging.debug(" ---> ~SN~FRFailed~FY to fetch ~FGfeed icon~FY: %s" % e)
        if url:
            image, image_file = self.get_image_from_url(url)
        return image, image_file, url
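
    # The page content above typically declares its favicon with markup like
    # this (illustrative HTML; _url_from_html() below matches exactly these
    # rel values):
    #
    #   <link rel="shortcut icon" href="/favicon.ico">
    #   <link rel="icon" href="https://example.com/static/icon.png">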

    @property
    def cleaned_feed_link(self):
        if self.feed.feed_link.startswith('http'):
            return self.feed.feed_link
        return 'http://' + self.feed.feed_link
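
    # e.g. a bare 'example.com/blog' becomes 'http://example.com/blog', while
    # links already starting with http or https are returned unchanged.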

    def fetch_image_from_path(self, path='favicon.ico', force=False):
        image = None
        url = None

        if not force:
            url = self.feed_icon.icon_url
        if not url and self.feed.feed_link and len(self.feed.feed_link) > 6:
            try:
                url = urllib.parse.urljoin(self.feed.feed_link, 'favicon.ico')
            except ValueError:
                url = None
        if not url:
            return None, None, None

        image, image_file = self.get_image_from_url(url)
        if not image:
            url = urllib.parse.urljoin(self.feed.feed_link, '/favicon.ico')
            image, image_file = self.get_image_from_url(url)
        # print 'Found: %s - %s' % (url, image)
        return image, image_file, url
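
    # The two joins above differ on purpose: a relative 'favicon.ico' resolves
    # against the link's current directory, while '/favicon.ico' resolves to
    # the domain root. With a hypothetical feed_link = 'http://example.com/blog/':
    #
    #   urllib.parse.urljoin(feed_link, 'favicon.ico')   # http://example.com/blog/favicon.ico
    #   urllib.parse.urljoin(feed_link, '/favicon.ico')  # http://example.com/favicon.ico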

    def fetch_facebook_image(self):
        facebook_fetcher = FacebookFetcher(self.feed)
        url = facebook_fetcher.favicon_url()
        image, image_file = self.get_image_from_url(url)
        if not image:
            url = urllib.parse.urljoin(self.feed.feed_link, '/favicon.ico')
            image, image_file = self.get_image_from_url(url)
        # print 'Found: %s - %s' % (url, image)
        return image, image_file, url

    def get_image_from_url(self, url):
        # print 'Requesting: %s' % url
        if not url:
            return None, None

        @timelimit(30)
        def _1(url):
            headers = {
                'User-Agent': 'NewsBlur Favicon Fetcher - %s subscriber%s - %s %s' %
                              (
                                  self.feed.num_subscribers,
                                  's' if self.feed.num_subscribers != 1 else '',
                                  self.feed.permalink,
                                  self.feed.fake_user_agent,
                              ),
                'Connection': 'close',
                'Accept': 'image/png,image/x-icon,image/*;q=0.9,*/*;q=0.8'
            }
            try:
                request = urllib.request.Request(url, headers=headers)
                icon = urllib.request.urlopen(request).read()
            except Exception:
                return None
            return icon

        try:
            icon = _1(url)
        except TimeoutError:
            return None, None

        try:
            icon_file = BytesIO(icon)
            image = Image.open(icon_file)
        except (IOError, ValueError):
            return None, None

        return image, icon_file
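
    # With hypothetical values the assembled User-Agent reads roughly:
    # 'NewsBlur Favicon Fetcher - 5 subscribers - <permalink> <fake user agent>'
    # (the pluralization guard turns one subscriber into '1 subscriber'), and
    # the Accept header prefers PNG and ICO images over anything else.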

    def _url_from_html(self, content):
        url = None
        if not content:
            return url
        try:
            if isinstance(content, str):
                content = content.encode('utf-8')
            icon_path = lxml.html.fromstring(content).xpath(
                '//link[@rel="icon" or @rel="shortcut icon"]/@href'
            )
        except (lxml.etree.ParserError, TypeError):
            return url

        if icon_path:
            if str(icon_path[0]).startswith('http'):
                url = icon_path[0]
            else:
                url = urllib.parse.urljoin(self.feed.feed_link, icon_path[0])
        return url
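
    # Resolution sketch, assuming a hypothetical feed_link = 'http://example.com/':
    #
    #   href="/img/icon.png"                -> 'http://example.com/img/icon.png'
    #   href="http://cdn.example.com/i.png" -> returned as-is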

    def normalize_image(self, image):
        # if image.size != (16, 16):
        #     image = image.resize((16, 16), Image.BICUBIC)
        if image.mode != 'RGBA':
            try:
                image = image.convert('RGBA')
            except IOError:
                pass

        return image

    def determine_dominant_color_in_image(self, image):
        NUM_CLUSTERS = 5

        # Convert image into array of values for each point.
        if image.mode == '1':
            image = image.convert('L')
        ar = numpy.array(image)
        # ar = scipy.misc.fromimage(image)
        shape = ar.shape

        # Reshape array of values to merge color bands. [[R], [G], [B], [A]] => [R, G, B, A]
        if len(shape) > 2:
            ar = ar.reshape(numpy.prod(shape[:2]), shape[2])

        # Get NUM_CLUSTERS worth of centroids.
        ar = ar.astype(float)
        codes, _ = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)

        # Pare centroids, removing blacks and whites and shades of really dark and really light.
        original_codes = codes
        for low, hi in [(60, 200), (35, 230), (10, 250)]:
            codes = numpy.array([code for code in codes
                                 if not ((code[0] < low and code[1] < low and code[2] < low) or
                                         (code[0] > hi and code[1] > hi and code[2] > hi))])
            if not len(codes):
                codes = original_codes
            else:
                break

        # Assign codes (vector quantization). Each vector is compared to the centroids
        # and assigned the nearest one.
        vecs, _ = scipy.cluster.vq.vq(ar, codes)

        # Count occurrences of each clustered vector.
        counts, bins = numpy.histogram(vecs, len(codes))

        # Show colors for each code in its hex value.
        # colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]
        # total = scipy.sum(counts)
        # print dict(zip(colors, [count/float(total) for count in counts]))

        # Find the most frequent color, based on the counts.
        index_max = numpy.argmax(counts)
        peak = codes.astype(int)[index_max]
        color = "{:02x}{:02x}{:02x}".format(peak[0], peak[1], peak[2])

        return color[:6]
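
    # A worked sketch with hypothetical numbers: for a mostly-blue 16x16 icon,
    # the five k-means centroids might land near black, white, and three blues.
    # The black/white centroids are pared away by the (60, 200) band, vq() then
    # assigns each of the 256 pixels to a surviving centroid, and the most
    # populous one, say (30, 90, 200), is formatted as '1e5ac8'.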

    def string_from_image(self, image):
        output = BytesIO()
        image.save(output, 'png', quality=95)
        contents = output.getvalue()
        output.close()
        return base64.b64encode(contents).decode()
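
    # Round-trip sketch: the base64 string stored on MFeedIcon.data can be
    # turned back into a PIL image (illustrative check, not production code):
    #
    #   Image.open(BytesIO(base64.b64decode(feed_icon.data)))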