From 00ba259c668af1321a8992be0eec1117659fecf2 Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Tue, 18 Sep 2012 17:09:07 -0700
Subject: [PATCH] Migrating to S3 for feed pages.

---
 apps/reader/views.py            |  5 +++++
 apps/rss_feeds/icon_importer.py |  8 ++++++++
 apps/rss_feeds/models.py        |  7 +++++++
 apps/rss_feeds/page_importer.py | 36 ++++++++++++++++++++++++++++-----
 local_settings.py.template      | 16 +++++++--------
 settings.py                     | 23 +++++++++++++++++++++
 6 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/apps/reader/views.py b/apps/reader/views.py
index 6ddb16911..93acda2db 100644
--- a/apps/reader/views.py
+++ b/apps/reader/views.py
@@ -564,6 +564,11 @@ def load_feed_page(request, feed_id):
         raise Http404
 
     feed = Feed.get_by_id(feed_id)
+
+    if feed.has_page and not feed.has_page_exception and feed.s3_page:
+        return HttpResponseRedirect('//%s/%s' % (settings.S3_PAGES_BUCKET_NAME,
+                                                 feed.s3_pages_key))
+
     data = MFeedPage.get_data(feed_id=feed_id)
 
     if not data or not feed.has_page or feed.has_page_exception:
diff --git a/apps/rss_feeds/icon_importer.py b/apps/rss_feeds/icon_importer.py
index ee57bc77b..17a007bfc 100644
--- a/apps/rss_feeds/icon_importer.py
+++ b/apps/rss_feeds/icon_importer.py
@@ -6,8 +6,10 @@ import scipy.cluster
 import urlparse
 import struct
 import operator
+import gzip
 import BmpImagePlugin, PngImagePlugin, Image
 from StringIO import StringIO
+from django.conf import settings
 from apps.rss_feeds.models import MFeedPage, MFeedIcon
 from utils.feed_functions import timelimit, TimeoutError
 
@@ -146,6 +148,12 @@ class IconImporter(object):
         image_file = None
         if self.page_data:
             content = self.page_data
+        elif settings.BACKED_BY_AWS.get('pages_on_s3') and self.feed.s3_page:
+            key = settings.S3_PAGES_BUCKET.get_key(self.feed.s3_pages_key)
+            compressed_content = key.get_contents_as_string()
+            stream = StringIO(compressed_content)
+            gz = gzip.GzipFile(fileobj=stream)
+            content = gz.read()
         else:
             content = MFeedPage.get_data(feed_id=self.feed.pk)
         url = self._url_from_html(content)
diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index 080844988..6d2b84c0b 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -69,6 +69,8 @@ class Feed(models.Model):
     last_load_time = models.IntegerField(default=0)
     favicon_color = models.CharField(max_length=6, null=True, blank=True)
     favicon_not_found = models.BooleanField(default=False)
+    s3_page = models.NullBooleanField(default=False, blank=True, null=True)
+    s3_icon = models.NullBooleanField(default=False, blank=True, null=True)
 
     class Meta:
         db_table="feeds"
@@ -95,6 +97,11 @@ class Feed(models.Model):
             Site.objects.get_current().domain,
             self.favicon_url
         )
+
+    @property
+    def s3_pages_key(self):
+        return "%s.gz.html" % self.pk
+
     def canonical(self, full=False, include_favicon=True):
         feed = {
             'id': self.pk,
diff --git a/apps/rss_feeds/page_importer.py b/apps/rss_feeds/page_importer.py
index 487976d27..46934e8f6 100644
--- a/apps/rss_feeds/page_importer.py
+++ b/apps/rss_feeds/page_importer.py
@@ -6,6 +6,9 @@ import feedparser
 import time
 import urllib2
 import httplib
+import gzip
+import StringIO
+from boto.s3.key import Key
 from django.conf import settings
 from utils import log as logging
 from apps.rss_feeds.models import MFeedPage
@@ -169,9 +172,32 @@ def save_page(self, html):
         if html and len(html) > 100:
-            feed_page, created = MFeedPage.objects.get_or_create(feed_id=self.feed.pk,
-                                                                  auto_save=True)
-            feed_page.page_data = html
-            feed_page.save()
+            if settings.BACKED_BY_AWS.get('pages_on_s3'):
+                k = Key(settings.S3_PAGES_BUCKET)
+                k.key = self.feed.s3_pages_key
+                k.set_metadata('Content-Encoding', 'gzip')
+                k.set_metadata('Content-Type', 'text/html')
+                k.set_metadata('Access-Control-Allow-Origin', '*')
+                out = StringIO.StringIO()
+                f = gzip.GzipFile(fileobj=out, mode='w')
+                f.write(html)
+                f.close()
+                compressed_html = out.getvalue()
+                k.set_contents_from_string(compressed_html)
+                k.set_acl('public-read')
+
+                if not self.feed.s3_page:
+                    try:
+                        feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
+                        feed_page.delete()
+                        logging.debug('   --->> [%-30s] ~FYTransferring page data to S3...' % (self.feed))
+                    except MFeedPage.DoesNotExist:
+                        pass
-            return feed_page
+                self.feed.s3_page = True
+                self.feed.save()
+            else:
+                feed_page, _ = MFeedPage.objects.get_or_create(feed_id=self.feed.pk)
+                feed_page.page_data = html
+                feed_page.save()
+                return feed_page
diff --git a/local_settings.py.template b/local_settings.py.template
index 9e2e982b9..df2f0220a 100644
--- a/local_settings.py.template
+++ b/local_settings.py.template
@@ -76,19 +76,19 @@ MONGODB_SLAVE = {
     'host': '127.0.0.1'
 }
 
-# Celery RabbitMQ Broker
-BROKER_HOST = "127.0.0.1"
+# Celery RabbitMQ/Redis Broker
+CELERY_REDIS_HOST = "127.0.0.1"
+BROKER_URL = "redis://127.0.0.1:6379/0"
 
 REDIS = {
     'host': '127.0.0.1',
 }
 
-# AMQP - RabbitMQ server
-BROKER_HOST = "db01.newsblur.com"
-BROKER_PORT = 5672
-BROKER_USER = "newsblur"
-BROKER_PASSWORD = "newsblur"
-BROKER_VHOST = "newsblurvhost"
+BACKED_BY_AWS = {
+    'pages_on_s3': False,
+    'icons_on_s3': False,
+    'stories_on_dynamodb': False,
+}
 
 # ===========
 # = Logging =
diff --git a/settings.py b/settings.py
index 296235ac1..49180f461 100644
--- a/settings.py
+++ b/settings.py
@@ -3,6 +3,7 @@ import logging
 import os
 import datetime
 from mongoengine import connect
+from boto.s3.connection import S3Connection
 import redis
 from utils import jammit
 
@@ -409,6 +410,16 @@ FACEBOOK_SECRET = '99999999999999999999999999999999'
 TWITTER_CONSUMER_KEY = 'ooooooooooooooooooooo'
 TWITTER_CONSUMER_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
 
+# ===============
+# = AWS Backing =
+# ===============
+
+BACKED_BY_AWS = {
+    'pages_on_s3': False,
+    'icons_on_s3': False,
+    'stories_on_dynamodb': False,
+}
+
 # ==================
 # = Configurations =
 # ==================
@@ -424,6 +435,9 @@ TEMPLATE_DEBUG = DEBUG
 ACCOUNT_ACTIVATION_DAYS = 30
 AWS_ACCESS_KEY_ID = S3_ACCESS_KEY
 AWS_SECRET_ACCESS_KEY = S3_SECRET
+S3_BACKUP_BUCKET = 'newsblur_backups'
+S3_PAGES_BUCKET_NAME = 'pages.newsblur.com'
+S3_ICONS_BUCKET_NAME = 'icons.newsblur.com'
 
 def custom_show_toolbar(request):
     return DEBUG
@@ -470,3 +484,12 @@ if DEBUG:
     MIDDLEWARE_CLASSES += ('utils.request_introspection_middleware.DumpRequestMiddleware',)
     MIDDLEWARE_CLASSES += ('utils.exception_middleware.ConsoleExceptionMiddleware',)
 
+# =======
+# = AWS =
+# =======
+
+S3_CONN = None
+if BACKED_BY_AWS.get('pages_on_s3') or BACKED_BY_AWS.get('icons_on_s3'):
+    S3_CONN = S3Connection(S3_ACCESS_KEY, S3_SECRET)
+    S3_PAGES_BUCKET = S3_CONN.get_bucket(S3_PAGES_BUCKET_NAME)
+    S3_ICONS_BUCKET = S3_CONN.get_bucket(S3_ICONS_BUCKET_NAME)
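
A note on the gzip round-trip at the heart of this patch: save_page() compresses
the page HTML in memory and uploads it with a Content-Encoding: gzip header, so
browsers redirected to the bucket by load_feed_page() decompress it transparently.
IconImporter has to gunzip by hand because boto's get_contents_as_string() returns
the raw stored bytes. A minimal sketch of that round-trip, in the codebase's
Python 2 idiom (the compress_html/decompress_html helper names are illustrative,
not code from the patch):

    import gzip
    import StringIO

    def compress_html(html):
        # Mirrors save_page(): gzip into an in-memory buffer, then hand the
        # buffer's bytes to Key.set_contents_from_string().
        out = StringIO.StringIO()
        f = gzip.GzipFile(fileobj=out, mode='w')
        f.write(html)
        f.close()
        return out.getvalue()

    def decompress_html(compressed):
        # Mirrors IconImporter: wrap the S3 payload in a file-like object
        # and gunzip it back to the original HTML.
        stream = StringIO.StringIO(compressed)
        return gzip.GzipFile(fileobj=stream).read()

    html = '<html><body>feed page</body></html>'
    assert decompress_html(compress_html(html)) == html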