Migrating to S3 for feed pages.

Samuel Clay 2012-09-18 17:09:07 -07:00
parent 801fe4f20d
commit 00ba259c66
6 changed files with 82 additions and 13 deletions

View file

@@ -564,6 +564,11 @@ def load_feed_page(request, feed_id):
         raise Http404
     feed = Feed.get_by_id(feed_id)
+    if feed.has_page and not feed.has_page_exception and feed.s3_page:
+        return HttpResponseRedirect('//%s/%s' % (settings.S3_PAGES_BUCKET_NAME,
+                                                 feed.s3_pages_key))
     data = MFeedPage.get_data(feed_id=feed_id)
     if not data or not feed.has_page or feed.has_page_exception:

View file

@@ -6,8 +6,10 @@ import scipy.cluster
 import urlparse
 import struct
 import operator
+import gzip
 import BmpImagePlugin, PngImagePlugin, Image
 from StringIO import StringIO
+from django.conf import settings
 from apps.rss_feeds.models import MFeedPage, MFeedIcon
 from utils.feed_functions import timelimit, TimeoutError
@@ -146,6 +148,12 @@ class IconImporter(object):
         image_file = None
         if self.page_data:
             content = self.page_data
+        elif settings.BACKED_BY_AWS.get('pages_on_s3') and self.feed.s3_page:
+            key = settings.S3_PAGES_BUCKET.get_key(self.feed.s3_pages_key)
+            compressed_content = key.get_contents_as_string()
+            stream = StringIO(compressed_content)
+            gz = gzip.GzipFile(fileobj=stream)
+            content = gz.read()
         else:
             content = MFeedPage.get_data(feed_id=self.feed.pk)
         url = self._url_from_html(content)

View file

@@ -69,6 +69,8 @@ class Feed(models.Model):
     last_load_time = models.IntegerField(default=0)
     favicon_color = models.CharField(max_length=6, null=True, blank=True)
     favicon_not_found = models.BooleanField(default=False)
+    s3_page = models.NullBooleanField(default=False, blank=True, null=True)
+    s3_icon = models.NullBooleanField(default=False, blank=True, null=True)

     class Meta:
         db_table="feeds"
@@ -95,6 +97,11 @@
             Site.objects.get_current().domain,
             self.favicon_url
         )
+
+    @property
+    def s3_pages_key(self):
+        return "%s.gz.html" % self.pk
+
     def canonical(self, full=False, include_favicon=True):
         feed = {
             'id': self.pk,
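As a usage note, here is a minimal sketch (not part of the diff; the feed id 42 is a made-up example) of how this key combines with the bucket name added in the settings hunks below to form the protocol-relative URL that the load_feed_page hunk above redirects to:

# Sketch only: mirrors Feed.s3_pages_key and the redirect in load_feed_page.
S3_PAGES_BUCKET_NAME = 'pages.newsblur.com'        # value added in the settings below
feed_pk = 42                                       # hypothetical feed id
s3_pages_key = "%s.gz.html" % feed_pk              # what the new property returns
redirect_url = '//%s/%s' % (S3_PAGES_BUCKET_NAME, s3_pages_key)
print redirect_url                                 # //pages.newsblur.com/42.gz.html

The protocol-relative form ('//host/key') lets the redirect work over both HTTP and HTTPS.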

View file

@@ -6,6 +6,9 @@ import feedparser
 import time
 import urllib2
 import httplib
+import gzip
+import StringIO
+from boto.s3.key import Key
 from django.conf import settings
 from utils import log as logging
 from apps.rss_feeds.models import MFeedPage
@@ -169,9 +172,32 @@ class PageImporter(object):
     def save_page(self, html):
         if html and len(html) > 100:
-            feed_page, created = MFeedPage.objects.get_or_create(feed_id=self.feed.pk,
-                                                                 auto_save=True)
-            feed_page.page_data = html
-            feed_page.save()
-            return feed_page
+            if settings.BACKED_BY_AWS.get('pages_on_s3'):
+                k = Key(settings.S3_PAGES_BUCKET)
+                k.key = self.feed.s3_pages_key
+                k.set_metadata('Content-Encoding', 'gzip')
+                k.set_metadata('Content-Type', 'text/html')
+                k.set_metadata('Access-Control-Allow-Origin', '*')
+                out = StringIO.StringIO()
+                f = gzip.GzipFile(fileobj=out, mode='w')
+                f.write(html)
+                f.close()
+                compressed_html = out.getvalue()
+                k.set_contents_from_string(compressed_html)
+                k.set_acl('public-read')
+                if False and not self.feed.s3_page:
+                    try:
+                        feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
+                        feed_page.delete()
+                        logging.debug(' --->> [%-30s] ~FYTransfering page data to S3...' % (self.feed))
+                    except MFeedPage.DoesNotExist:
+                        pass
+                self.feed.s3_page = True
+                self.feed.save()
+            else:
+                feed_page, _ = MFeedPage.objects.get_or_create(feed_id=self.feed.pk)
+                feed_page.page_data = html
+                feed_page.save()
+                return feed_page
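The write path here and the read path in the IconImporter hunk above are the two halves of one in-memory gzip round trip. A self-contained sketch of that round trip (Python 2, standard library only, no S3 involved):

import gzip
import StringIO

def compress_html(html):
    # Write side, as save_page does before uploading to S3.
    out = StringIO.StringIO()
    f = gzip.GzipFile(fileobj=out, mode='w')
    f.write(html)
    f.close()
    return out.getvalue()

def decompress_html(compressed_html):
    # Read side, as IconImporter does after key.get_contents_as_string().
    stream = StringIO.StringIO(compressed_html)
    return gzip.GzipFile(fileobj=stream).read()

html = '<html><body>feed page</body></html>'
assert decompress_html(compress_html(html)) == html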

View file

@@ -76,19 +76,19 @@ MONGODB_SLAVE = {
     'host': '127.0.0.1'
 }

-# Celery RabbitMQ Broker
-BROKER_HOST = "127.0.0.1"
+# Celery RabbitMQ/Redis Broker
+CELERY_REDIS_HOST = "127.0.0.1"
+BROKER_URL = "redis://127.0.0.1:6379/0"

 REDIS = {
     'host': '127.0.0.1',
 }

-# AMQP - RabbitMQ server
-BROKER_HOST = "db01.newsblur.com"
-BROKER_PORT = 5672
-BROKER_USER = "newsblur"
-BROKER_PASSWORD = "newsblur"
-BROKER_VHOST = "newsblurvhost"
+BACKED_BY_AWS = {
+    'pages_on_s3': False,
+    'icons_on_s3': False,
+    'stories_on_dynamodb': False,
+}

 # ===========
 # = Logging =

View file

@@ -3,6 +3,7 @@ import logging
 import os
 import datetime
 from mongoengine import connect
+from boto.s3.connection import S3Connection
 import redis
 from utils import jammit
@@ -409,6 +410,16 @@ FACEBOOK_SECRET = '99999999999999999999999999999999'
 TWITTER_CONSUMER_KEY = 'ooooooooooooooooooooo'
 TWITTER_CONSUMER_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

+# ===============
+# = AWS Backing =
+# ===============
+
+BACKED_BY_AWS = {
+    'pages_on_s3': False,
+    'icons_on_s3': False,
+    'stories_on_dynamodb': False,
+}
+
 # ==================
 # = Configurations =
 # ==================
@@ -424,6 +435,9 @@ TEMPLATE_DEBUG = DEBUG
 ACCOUNT_ACTIVATION_DAYS = 30
 AWS_ACCESS_KEY_ID = S3_ACCESS_KEY
 AWS_SECRET_ACCESS_KEY = S3_SECRET
+S3_BACKUP_BUCKET = 'newsblur_backups'
+S3_PAGES_BUCKET_NAME = 'pages.newsblur.com'
+S3_ICONS_BUCKET_NAME = 'icons.newsblur.com'

 def custom_show_toolbar(request):
     return DEBUG
@@ -470,3 +484,12 @@ if DEBUG:
     MIDDLEWARE_CLASSES += ('utils.request_introspection_middleware.DumpRequestMiddleware',)
     MIDDLEWARE_CLASSES += ('utils.exception_middleware.ConsoleExceptionMiddleware',)
+
+# =======
+# = AWS =
+# =======
+
+S3_CONN = None
+if BACKED_BY_AWS.get('pages_on_s3') or BACKED_BY_AWS.get('icons_on_s3'):
+    S3_CONN = S3Connection(S3_ACCESS_KEY, S3_SECRET)
+    S3_PAGES_BUCKET = S3_CONN.get_bucket(S3_PAGES_BUCKET_NAME)
+    S3_ICONS_BUCKET = S3_CONN.get_bucket(S3_ICONS_BUCKET_NAME)
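Once 'pages_on_s3' is flipped on, these settings are all a worker needs to push a page into the bucket. A rough standalone sketch of the upload path that save_page follows (boto 2.x; the credentials and feed id below are placeholders, not values from the commit):

import gzip
import StringIO
from boto.s3.connection import S3Connection
from boto.s3.key import Key

# Placeholder credentials; the real values come from S3_ACCESS_KEY / S3_SECRET in settings.
conn = S3Connection('AKIA...', 'secret...')
pages_bucket = conn.get_bucket('pages.newsblur.com')    # S3_PAGES_BUCKET_NAME

# Gzip the page in memory, then upload it under the feed's key with the same
# metadata and ACL that save_page sets.
out = StringIO.StringIO()
gz = gzip.GzipFile(fileobj=out, mode='w')
gz.write('<html><body>feed page</body></html>')
gz.close()

key = Key(pages_bucket)
key.key = '42.gz.html'                                  # hypothetical Feed.s3_pages_key
key.set_metadata('Content-Encoding', 'gzip')
key.set_metadata('Content-Type', 'text/html')
key.set_contents_from_string(out.getvalue())
key.set_acl('public-read')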