Migrating to S3 for feed pages.

Samuel Clay 2012-09-18 17:09:07 -07:00
parent 801fe4f20d
commit 00ba259c66
6 changed files with 82 additions and 13 deletions

View file

@@ -564,6 +564,11 @@ def load_feed_page(request, feed_id):
         raise Http404
 
     feed = Feed.get_by_id(feed_id)
 
+    if feed.has_page and not feed.has_page_exception and feed.s3_page:
+        return HttpResponseRedirect('//%s/%s' % (settings.S3_PAGES_BUCKET_NAME,
+                                                 feed.s3_pages_key))
+
     data = MFeedPage.get_data(feed_id=feed_id)
     if not data or not feed.has_page or feed.has_page_exception:
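Note on the redirect above: when a feed's page has already been mirrored to S3, load_feed_page skips MongoDB entirely and answers with a protocol-relative redirect to the pages bucket. A minimal sketch of how that URL is assembled, using the bucket name and key format introduced elsewhere in this commit (the feed id 42 is only an example):

S3_PAGES_BUCKET_NAME = 'pages.newsblur.com'   # defined in settings.py below
s3_pages_key = '%s.gz.html' % 42              # Feed.s3_pages_key for feed 42

redirect_url = '//%s/%s' % (S3_PAGES_BUCKET_NAME, s3_pages_key)
# -> '//pages.newsblur.com/42.gz.html', protocol-relative, so the same
#    redirect works for both http and https page loads.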

View file

@@ -6,8 +6,10 @@ import scipy.cluster
 import urlparse
 import struct
 import operator
+import gzip
 import BmpImagePlugin, PngImagePlugin, Image
 from StringIO import StringIO
+from django.conf import settings
 from apps.rss_feeds.models import MFeedPage, MFeedIcon
 from utils.feed_functions import timelimit, TimeoutError
@@ -146,6 +148,12 @@ class IconImporter(object):
         image_file = None
         if self.page_data:
             content = self.page_data
+        elif settings.BACKED_BY_AWS.get('pages_on_s3') and self.feed.s3_page:
+            key = settings.S3_PAGES_BUCKET.get_key(self.feed.s3_pages_key)
+            compressed_content = key.get_contents_as_string()
+            stream = StringIO(compressed_content)
+            gz = gzip.GzipFile(fileobj=stream)
+            content = gz.read()
         else:
             content = MFeedPage.get_data(feed_id=self.feed.pk)
         url = self._url_from_html(content)
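The page content read back here is stored gzip-compressed, so the icon importer has to decompress the key's contents before scanning the HTML for a favicon link. A self-contained sketch of that read path, assuming a boto bucket object such as settings.S3_PAGES_BUCKET and Python 2's StringIO/gzip as imported above (read_gzipped_key is an illustrative helper name, not part of the commit):

import gzip
from StringIO import StringIO

def read_gzipped_key(bucket, key_name):
    # Fetch the object and decompress it in memory; returns None if missing.
    key = bucket.get_key(key_name)
    if not key:
        return None
    compressed = key.get_contents_as_string()
    return gzip.GzipFile(fileobj=StringIO(compressed)).read()

# e.g. content = read_gzipped_key(settings.S3_PAGES_BUCKET, feed.s3_pages_key)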

View file

@@ -69,6 +69,8 @@ class Feed(models.Model):
     last_load_time = models.IntegerField(default=0)
     favicon_color = models.CharField(max_length=6, null=True, blank=True)
     favicon_not_found = models.BooleanField(default=False)
+    s3_page = models.NullBooleanField(default=False, blank=True, null=True)
+    s3_icon = models.NullBooleanField(default=False, blank=True, null=True)
 
     class Meta:
         db_table="feeds"
@@ -95,6 +97,11 @@ class Feed(models.Model):
                 Site.objects.get_current().domain,
                 self.favicon_url
             )
 
+    @property
+    def s3_pages_key(self):
+        return "%s.gz.html" % self.pk
+
     def canonical(self, full=False, include_favicon=True):
         feed = {
             'id': self.pk,
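The two NullBooleanField flags record whether a feed's page and icon have been pushed to S3, and s3_pages_key derives the object name from the feed's primary key. For illustration only (feed id 1234 is hypothetical):

feed = Feed.objects.get(pk=1234)   # hypothetical feed id
print feed.s3_pages_key            # '1234.gz.html'
# Once feed.s3_page is True, the page is served from
# //pages.newsblur.com/1234.gz.html instead of MongoDB.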

View file

@@ -6,6 +6,9 @@ import feedparser
 import time
 import urllib2
 import httplib
+import gzip
+import StringIO
+from boto.s3.key import Key
 from django.conf import settings
 from utils import log as logging
 from apps.rss_feeds.models import MFeedPage
@@ -169,9 +172,32 @@ class PageImporter(object):
 
     def save_page(self, html):
         if html and len(html) > 100:
-            feed_page, created = MFeedPage.objects.get_or_create(feed_id=self.feed.pk,
-                                                                 auto_save=True)
-            feed_page.page_data = html
-            feed_page.save()
-
-            return feed_page
+            if settings.BACKED_BY_AWS.get('pages_on_s3'):
+                k = Key(settings.S3_PAGES_BUCKET)
+                k.key = self.feed.s3_pages_key
+                k.set_metadata('Content-Encoding', 'gzip')
+                k.set_metadata('Content-Type', 'text/html')
+                k.set_metadata('Access-Control-Allow-Origin', '*')
+                out = StringIO.StringIO()
+                f = gzip.GzipFile(fileobj=out, mode='w')
+                f.write(html)
+                f.close()
+                compressed_html = out.getvalue()
+                k.set_contents_from_string(compressed_html)
+                k.set_acl('public-read')
+
+                if False and not self.feed.s3_page:
+                    try:
+                        feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
+                        feed_page.delete()
+                        logging.debug('   --->> [%-30s] ~FYTransfering page data to S3...' % (self.feed))
+                    except MFeedPage.DoesNotExist:
+                        pass
+
+                self.feed.s3_page = True
+                self.feed.save()
+            else:
+                feed_page, _ = MFeedPage.objects.get_or_create(feed_id=self.feed.pk)
+                feed_page.page_data = html
+                feed_page.save()
+
+                return feed_page
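When pages_on_s3 is enabled, save_page now gzips the fetched HTML in memory and writes it to the pages bucket with Content-Encoding, Content-Type and CORS metadata plus a public-read ACL, so browsers can fetch and decompress the object directly. The MFeedPage cleanup is still gated behind "if False", so the MongoDB copy is not deleted yet. A standalone sketch of the upload path under the same assumptions (boto, Python 2; upload_gzipped_page is an illustrative helper name):

import gzip
import StringIO
from boto.s3.key import Key

def upload_gzipped_page(bucket, key_name, html):
    # Write html to S3 as a gzip-compressed, publicly readable object.
    k = Key(bucket)
    k.key = key_name
    k.set_metadata('Content-Encoding', 'gzip')
    k.set_metadata('Content-Type', 'text/html')
    k.set_metadata('Access-Control-Allow-Origin', '*')
    out = StringIO.StringIO()
    gz = gzip.GzipFile(fileobj=out, mode='w')
    gz.write(html)
    gz.close()
    k.set_contents_from_string(out.getvalue())
    k.set_acl('public-read')

# e.g. upload_gzipped_page(settings.S3_PAGES_BUCKET, feed.s3_pages_key, html)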

View file

@@ -76,19 +76,19 @@ MONGODB_SLAVE = {
     'host': '127.0.0.1'
 }
 
-# Celery RabbitMQ Broker
-BROKER_HOST = "127.0.0.1"
+# Celery RabbitMQ/Redis Broker
+CELERY_REDIS_HOST = "127.0.0.1"
+BROKER_URL = "redis://127.0.0.1:6379/0"
 
 REDIS = {
     'host': '127.0.0.1',
 }
 
-# AMQP - RabbitMQ server
-BROKER_HOST = "db01.newsblur.com"
-BROKER_PORT = 5672
-BROKER_USER = "newsblur"
-BROKER_PASSWORD = "newsblur"
-BROKER_VHOST = "newsblurvhost"
+BACKED_BY_AWS = {
+    'pages_on_s3': False,
+    'icons_on_s3': False,
+    'stories_on_dynamodb': False,
+}
 
 # ===========
 # = Logging =

View file

@@ -3,6 +3,7 @@ import logging
 import os
 import datetime
 from mongoengine import connect
+from boto.s3.connection import S3Connection
 import redis
 from utils import jammit
@@ -409,6 +410,16 @@ FACEBOOK_SECRET = '99999999999999999999999999999999'
 TWITTER_CONSUMER_KEY = 'ooooooooooooooooooooo'
 TWITTER_CONSUMER_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
 
+# ===============
+# = AWS Backing =
+# ===============
+
+BACKED_BY_AWS = {
+    'pages_on_s3': False,
+    'icons_on_s3': False,
+    'stories_on_dynamodb': False,
+}
+
 # ==================
 # = Configurations =
 # ==================
@@ -424,6 +435,9 @@ TEMPLATE_DEBUG = DEBUG
 ACCOUNT_ACTIVATION_DAYS = 30
 AWS_ACCESS_KEY_ID = S3_ACCESS_KEY
 AWS_SECRET_ACCESS_KEY = S3_SECRET
+S3_BACKUP_BUCKET = 'newsblur_backups'
+S3_PAGES_BUCKET_NAME = 'pages.newsblur.com'
+S3_ICONS_BUCKET_NAME = 'icons.newsblur.com'
 
 def custom_show_toolbar(request):
     return DEBUG
@@ -470,3 +484,12 @@ if DEBUG:
     MIDDLEWARE_CLASSES += ('utils.request_introspection_middleware.DumpRequestMiddleware',)
     MIDDLEWARE_CLASSES += ('utils.exception_middleware.ConsoleExceptionMiddleware',)
 
+# =======
+# = AWS =
+# =======
+
+S3_CONN = None
+if BACKED_BY_AWS.get('pages_on_s3') or BACKED_BY_AWS.get('icons_on_s3'):
+    S3_CONN = S3Connection(S3_ACCESS_KEY, S3_SECRET)
+    S3_PAGES_BUCKET = S3_CONN.get_bucket(S3_PAGES_BUCKET_NAME)
+    S3_ICONS_BUCKET = S3_CONN.get_bucket(S3_ICONS_BUCKET_NAME)
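S3_CONN and the bucket handles are only created when one of the BACKED_BY_AWS flags is on, so installs without AWS credentials never open an S3 connection at import time. A minimal sketch of a local_settings.py override that would turn the feature on (the credential values are placeholders; the bucket names are the ones defined above):

BACKED_BY_AWS = {
    'pages_on_s3': True,
    'icons_on_s3': True,
    'stories_on_dynamodb': False,
}
S3_ACCESS_KEY = 'AKIAXXXXXXXXXXXXXXXX'                    # placeholder
S3_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'    # placeholder
# settings.py then builds S3_CONN, S3_PAGES_BUCKET and S3_ICONS_BUCKET
# from these values at import time.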