"""
|
|
Sync Media to S3
|
|
================
|
|
|
|
Django command that scans all files in your settings.MEDIA_ROOT folder and
|
|
uploads them to S3 with the same directory structure.
|
|
|
|
This command can optionally do the following but it is off by default:
|
|
* gzip compress any CSS and Javascript files it finds and adds the appropriate
|
|
'Content-Encoding' header.
|
|
* set a far future 'Expires' header for optimal caching.
|
|
|
|
Note: This script requires the Python boto library and valid Amazon Web
|
|
Services API keys.
|
|
|
|
Required settings.py variables:
|
|
AWS_ACCESS_KEY_ID = ''
|
|
AWS_SECRET_ACCESS_KEY = ''
|
|
AWS_BUCKET_NAME = ''
|
|
|
|
Command options are:
|
|
-p PREFIX, --prefix=PREFIX
|
|
The prefix to prepend to the path on S3.
|
|
--gzip Enables gzipping CSS and Javascript files.
|
|
--expires Enables setting a far future expires header.
|
|
--force Skip the file mtime check to force upload of all
|
|
files.
|
|
--filter-list Override default directory and file exclusion
|
|
filters. (enter as comma seperated line)
|
|
|
|
TODO:
|
|
* Use fnmatch (or regex) to allow more complex FILTER_LIST rules.
|
|
|
|
"""

import datetime
import email
import mimetypes
import optparse
import os
import sys
import time

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

# Make sure boto is available
try:
    import boto
    import boto.exception
except ImportError:
    raise ImportError, "The boto Python library is not installed."


class Command(BaseCommand):

    # Extra variables to avoid passing these around
    AWS_ACCESS_KEY_ID = ''
    AWS_SECRET_ACCESS_KEY = ''
    AWS_BUCKET_NAME = ''
    DIRECTORY = ''
    FILTER_LIST = ['.DS_Store', '.svn', '.hg', '.git', 'Thumbs.db']
    GZIP_CONTENT_TYPES = (
        'text/css',
        'application/javascript',
        'application/x-javascript'
    )
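    # With --gzip, only files whose guessed Content-Type appears in
    # GZIP_CONTENT_TYPES (and which are larger than 1 KB) get compressed;
    # see upload_s3() below.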

    upload_count = 0
    skip_count = 0

    option_list = BaseCommand.option_list + (
        optparse.make_option('-p', '--prefix',
            dest='prefix', default='',
            help="The prefix to prepend to the path on S3."),
        optparse.make_option('-d', '--dir',
            dest='dir', default=settings.MEDIA_ROOT,
            help="The root directory to use instead of your MEDIA_ROOT"),
        optparse.make_option('--gzip',
            action='store_true', dest='gzip', default=False,
            help="Enables gzipping CSS and JavaScript files."),
        optparse.make_option('--expires',
            action='store_true', dest='expires', default=False,
            help="Enables setting a far future expires header."),
        optparse.make_option('--force',
            action='store_true', dest='force', default=False,
            help="Skip the file mtime check to force upload of all files."),
        optparse.make_option('--filter-list', dest='filter_list',
            action='store', default='',
            help="Override default directory and file exclusion filters. (enter as comma separated line)"),
    )
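    # --filter-list is split on commas, so an invocation along these lines
    # (command name illustrative) replaces the default FILTER_LIST entirely:
    #   python manage.py sync_media_s3 --filter-list=".DS_Store,.svn,originals"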

    help = 'Syncs the complete MEDIA_ROOT structure and files to S3 into the given bucket name.'
    args = 'bucket_name'

    can_import_settings = True

    def handle(self, *args, **options):

        # Check for AWS keys in settings
        if not hasattr(settings, 'AWS_ACCESS_KEY_ID') or \
           not hasattr(settings, 'AWS_SECRET_ACCESS_KEY'):
            raise CommandError('Missing AWS keys from settings file. Please ' +
                'supply both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.')
        else:
            self.AWS_ACCESS_KEY_ID = settings.AWS_ACCESS_KEY_ID
            self.AWS_SECRET_ACCESS_KEY = settings.AWS_SECRET_ACCESS_KEY

        if not hasattr(settings, 'AWS_BUCKET_NAME'):
            raise CommandError('Missing bucket name from settings file. Please' +
                ' add the AWS_BUCKET_NAME to your settings file.')
        else:
            if not settings.AWS_BUCKET_NAME:
                raise CommandError('AWS_BUCKET_NAME cannot be empty.')
            self.AWS_BUCKET_NAME = settings.AWS_BUCKET_NAME

        if not hasattr(settings, 'MEDIA_ROOT'):
            raise CommandError('MEDIA_ROOT must be set in your settings.')
        else:
            if not settings.MEDIA_ROOT:
                raise CommandError('MEDIA_ROOT must be set in your settings.')

        self.verbosity = int(options.get('verbosity'))
        self.prefix = options.get('prefix')
        self.do_gzip = options.get('gzip')
        self.do_expires = options.get('expires')
        self.do_force = options.get('force')
        self.DIRECTORY = options.get('dir')
        self.FILTER_LIST = getattr(settings, 'FILTER_LIST', self.FILTER_LIST)
        filter_list = options.get('filter_list')
        if filter_list:
            # command line option overrides the default FILTER_LIST and
            # settings.FILTER_LIST (only when --filter-list is actually given)
            self.FILTER_LIST = filter_list.split(',')

        # Now call the syncing method to walk the MEDIA_ROOT directory and
        # upload all files found.
        self.sync_s3()

        print
        print "%d files uploaded." % (self.upload_count)
        print "%d files skipped." % (self.skip_count)

    def sync_s3(self):
        """
        Walks the media directory and syncs files to S3
        """
        bucket, key = self.open_s3()
        os.path.walk(self.DIRECTORY, self.upload_s3,
            (bucket, key, self.AWS_BUCKET_NAME, self.DIRECTORY))

    def compress_string(self, s):
        """Gzip a given string."""
        import cStringIO, gzip
        zbuf = cStringIO.StringIO()
        zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
        zfile.write(s)
        zfile.close()
        return zbuf.getvalue()

    def open_s3(self):
        """
        Opens connection to S3 returning bucket and key
        """
        conn = boto.connect_s3(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)
        try:
            bucket = conn.get_bucket(self.AWS_BUCKET_NAME)
        except boto.exception.S3ResponseError:
            bucket = conn.create_bucket(self.AWS_BUCKET_NAME)
        return bucket, boto.s3.key.Key(bucket)
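    # Note: a single Key object is returned and reused for every file;
    # upload_s3() repoints key.name before each set_contents_from_string() call.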

    def upload_s3(self, arg, dirname, names):
        """
        This is the callback to os.path.walk and where much of the work happens
        """
        bucket, key, bucket_name, root_dir = arg  # expand arg tuple

        # Skip directories we don't want to sync
        if os.path.basename(dirname) in self.FILTER_LIST:
            # prevent walk from processing subfiles/subdirs below the ignored one
            del names[:]
            return

        # Later we assume the MEDIA_ROOT ends with a trailing slash
        if not root_dir.endswith(os.path.sep):
            root_dir = root_dir + os.path.sep

        for file in names:
            headers = {}

            if file in self.FILTER_LIST:
                continue  # Skip files we don't want to sync

            filename = os.path.join(dirname, file)
            if os.path.isdir(filename):
                continue  # Don't try to upload directories

            file_key = filename[len(root_dir):]
            if self.prefix:
                file_key = '%s/%s' % (self.prefix, file_key)

            # Check if file on S3 is older than local file, if so, upload
            if not self.do_force:
                s3_key = bucket.get_key(file_key)
                if s3_key:
                    s3_datetime = datetime.datetime(*time.strptime(
                        s3_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')[0:6])
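                    # last_modified is expected here as an RFC 1123 timestamp,
                    # e.g. 'Wed, 01 Jan 2014 00:00:00 GMT', which the strptime
                    # call above turns into a naive UTC datetime for comparison.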
                    local_datetime = datetime.datetime.utcfromtimestamp(
                        os.stat(filename).st_mtime)
                    if local_datetime < s3_datetime:
                        self.skip_count += 1
                        if self.verbosity > 1:
                            print "File %s hasn't been modified since last " \
                                "being uploaded" % (file_key)
                        continue

            # File is newer, let's process and upload
            if self.verbosity > 0:
                print "Uploading %s..." % (file_key)

            content_type = mimetypes.guess_type(filename)[0]
            if content_type:
                headers['Content-Type'] = content_type
            file_obj = open(filename, 'rb')
            file_size = os.fstat(file_obj.fileno()).st_size
            filedata = file_obj.read()
            if self.do_gzip:
                # Gzipping only if file is large enough (>1K is recommended)
                # and only if file is a common text type (not a binary file)
                if file_size > 1024 and content_type in self.GZIP_CONTENT_TYPES:
                    filedata = self.compress_string(filedata)
                    headers['Content-Encoding'] = 'gzip'
                    if self.verbosity > 1:
                        print "\tgzipped: %dk to %dk" % \
                            (file_size/1024, len(filedata)/1024)
            if self.do_expires:
                # HTTP/1.0 (usegmt=True makes formatdate emit a 'GMT' suffix)
                headers['Expires'] = email.Utils.formatdate(
                    time.mktime((datetime.datetime.now() +
                    datetime.timedelta(days=365*2)).timetuple()), usegmt=True)
                # HTTP/1.1 (the '=' is required by the Cache-Control syntax)
                headers['Cache-Control'] = 'max-age=%d' % (3600 * 24 * 365 * 2)
                if self.verbosity > 1:
                    print "\texpires: %s" % (headers['Expires'])
                    print "\tcache-control: %s" % (headers['Cache-Control'])

            try:
                key.name = file_key
                key.set_contents_from_string(filedata, headers, replace=True)
                key.set_acl('public-read')
            except boto.s3.connection.S3CreateError, e:
                print "Failed: %s" % e
            except Exception, e:
                print e
                raise
            else:
                self.upload_count += 1

            file_obj.close()


# Backwards compatibility for Django r9110
if not [opt for opt in Command.option_list if opt.dest=='verbosity']:
    Command.option_list += (
        optparse.make_option('-v', '--verbosity',
            dest='verbosity', default=1, action='count',
            help="Verbose mode. Multiple -v options increase the verbosity."),
    )