Rewriting mongo backup, adding in rotation.

Samuel Clay 2021-07-29 18:17:02 -04:00
parent d7c8f5f79c
commit 5aaf4d5c38
4 changed files with 129 additions and 57 deletions


@@ -8,7 +8,7 @@ beautifulsoup4==4.9.3
 billiard==3.6.4.0
 bleach==3.2.1
 boto==2.49.0
-boto3==1.17.67
+boto3==1.18.9
 botocore==1.20.67
 celery==4.4.7
 certifi==2020.12.5


@@ -21,11 +21,11 @@ for collection in ${collections[@]}; do
     now=$(date '+%Y-%m-%d-%H-%M')
     echo "---> Dumping $collection - ${now}"
-    docker exec -it mongo mongodump -d newsblur -c $collection -o /backup/backup_mongo_${now}
+    docker exec -it mongo mongodump -d newsblur -c $collection -o /backup/backup_mongo
 done;
 echo " ---> Compressing backup_mongo_${now}.tgz"
-tar -zcf /opt/mongo/newsblur/backup/backup_mongo_${now}.tgz /opt/mongo/newsblur/backup/backup_mongo_${now}
+tar -zcf /opt/mongo/newsblur/backup/backup_mongo_${now}.tgz /opt/mongo/newsblur/backup/backup_mongo
 echo " ---> Uploading backups to S3"
 docker run --rm -v /srv/newsblur:/srv/newsblur -v /opt/mongo/newsblur/backup/:/opt/mongo/newsblur/backup/ --network=newsblurnet newsblur/newsblur_python3:latest python /srv/newsblur/utils/backups/backup_mongo.py
@@ -33,4 +33,4 @@ docker run --rm -v /srv/newsblur:/srv/newsblur -v /opt/mongo/newsblur/backup/:/o
 # Don't delete backup since the backup_mongo.py script will rm them
 ## rm /opt/mongo/newsblur/backup/backup_mongo_${now}.tgz
 ## rm /opt/mongo/newsblur/backup/backup_mongo_${now}
-echo " ---> Finished uploading backups to S3: backup_mongo_${now}.tgz"
+echo " ---> Finished uploading backups to S3: backup_mongo.tgz"


@@ -1,19 +1,130 @@
 #!/usr/bin/python3
+from datetime import datetime, timedelta
 import os
-import shutil
-from newsblur_web import settings
+import re
+import logging
+import mimetypes
 import boto3
-from botocore.exceptions import ClientError
+import shutil
+from boto3.s3.transfer import S3Transfer
+from newsblur_web import settings
 
-filenames = [f for f in os.listdir('/opt/mongo/newsblur/backup/') if '.tgz' in f]
+logger = logging.getLogger(__name__)
 
-for filename in filenames:
-    print('Uploading %s to S3...' % filename)
-    try:
-        s3 = boto3.client('s3')
-        s3.upload_file(f"mongo/{filename}", settings.S3_BACKUP_BUCKET)
-    except ClientError as e:
-        print(" ****> Exceptions: %s" % e)
-    shutil.rmtree(filename[:-4])
-    os.remove(filename)
+def main():
+    BACKUP_DIR = '/opt/mongo/newsblur/backup/'
+    BACKUP_DIR = '/srv/newsblur'
+    filenames = [f for f in os.listdir(BACKUP_DIR) if '.tgz' in f]
+    for filename in filenames:
+        file_path = os.path.join(BACKUP_DIR, filename)
+        basename = os.path.basename(file_path)
+        key_base, key_ext = list(splitext(basename))
+        key_prefix = "".join(['mongo/', key_base])
+        key_datestamp = datetime.utcnow().strftime("-%Y-%m-%d-%H-%M")
+        key = "".join([key_prefix, key_datestamp, key_ext])
+        print("Uploading {0} to {1}".format(file_path, key))
+        upload(file_path, settings.S3_BACKUP_BUCKET, key)
+        print('Rotating file on S3 with key prefix {0} and extension {1}'.format(key_prefix, key_ext))
+        rotate(key_prefix, key_ext, settings.S3_BACKUP_BUCKET)
+        # shutil.rmtree(filename[:-4])
+        # os.remove(filename)
+
+def upload_rotate(file_path, s3_bucket, s3_key_prefix, aws_key=None, aws_secret=None):
+    '''
+    Upload file_path to s3 bucket with prefix
+    Ex. upload_rotate('/tmp/file-2015-01-01.tar.bz2', 'backups', 'foo.net/')
+    would upload file to bucket backups with key=foo.net/file-2015-01-01.tar.bz2
+    and then rotate all files starting with foo.net/file and with extension .tar.bz2
+    Timestamps need to be present between the file root and the extension and in the same format as strftime("%Y-%m-%d").
+    Ex file-2015-12-28.tar.bz2
+    '''
+    key = ''.join([s3_key_prefix, os.path.basename(file_path)])
+    logger.debug("Uploading {0} to {1}".format(file_path, key))
+    upload(file_path, s3_bucket, key, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret)
+    file_root, file_ext = splitext(os.path.basename(file_path))
+    # strip timestamp from file_base
+    regex = '(?P<filename>.*)-(?P<year>[\d]+?)-(?P<month>[\d]+?)-(?P<day>[\d]+?)'
+    match = re.match(regex, file_root)
+    if not match:
+        raise Exception('File does not contain a timestamp')
+    key_prefix = ''.join([s3_key_prefix, match.group('filename')])
+    logger.debug('Rotating files on S3 with key prefix {0} and extension {1}'.format(key_prefix, file_ext))
+    rotate(key_prefix, file_ext, s3_bucket, aws_key=aws_key, aws_secret=aws_secret)
+
+def rotate(key_prefix, key_ext, bucket_name, daily_backups=7, weekly_backups=4, aws_key=None, aws_secret=None):
+    """ Delete old files we've uploaded to S3 according to grandfather, father, son strategy """
+    session = boto3.Session(
+        aws_access_key_id=aws_key,
+        aws_secret_access_key=aws_secret
+    )
+    s3 = session.resource('s3')
+    bucket = s3.Bucket(bucket_name)
+    keys = bucket.objects.filter(Prefix=key_prefix)
+    regex = '{0}-(?P<year>[\d]+?)-(?P<month>[\d]+?)-(?P<day>[\d]+?)-(?P<hour>[\d]+?)-(?P<minute>[\d]+?){1}'.format(key_prefix, key_ext)
+    backups = []
+    for key in keys:
+        match = re.match(regex, str(key.key))
+        if not match:
+            continue
+        year = int(match.group('year'))
+        month = int(match.group('month'))
+        day = int(match.group('day'))
+        hour = int(match.group('hour'))
+        minute = int(match.group('minute'))
+        key_date = datetime(year, month, day, hour, minute)
+        backups[:0] = [key_date]
+    backups = sorted(backups, reverse=True)
+    if len(backups) > daily_backups+1 and backups[daily_backups] - backups[daily_backups+1] < timedelta(days=7):
+        key = bucket.Object("{0}{1}{2}".format(key_prefix,backups[daily_backups].strftime("-%Y-%m-%d-%H-%M"), key_ext))
+        logger.debug("deleting {0}".format(key))
+        key.delete()
+        del backups[daily_backups]
+    month_offset = daily_backups + weekly_backups
+    if len(backups) > month_offset+1 and backups[month_offset] - backups[month_offset+1] < timedelta(days=30):
+        key = bucket.Object("{0}{1}{2}".format(key_prefix,backups[month_offset].strftime("-%Y-%m-%d-%H-%M"), key_ext))
+        logger.debug("deleting {0}".format(key))
+        key.delete()
+        del backups[month_offset]
+
+def splitext( filename ):
+    """ Return the filename and extension according to the first dot in the filename.
+    This helps date stamping .tar.bz2 or .ext.gz files properly.
+    """
+    index = filename.find('.')
+    if index == 0:
+        index = 1+filename[1:].find('.')
+        if index == -1:
+            return filename, ''
+        return filename[:index], filename[index:]
+    return os.path.splitext(filename)
+
+def upload(source_path, bucketname, keyname, acl='private', guess_mimetype=True, aws_access_key_id=None, aws_secret_access_key=None):
+    client = boto3.client('s3', 'us-west-2', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+    transfer = S3Transfer(client)
+    # Upload /tmp/myfile to s3://bucket/key
+    extra_args = {
+        'ACL': acl,
+    }
+    if guess_mimetype:
+        mtype = mimetypes.guess_type(keyname)[0] or 'application/octet-stream'
+        extra_args['ContentType'] = mtype
+    transfer.upload_file(source_path, bucketname, keyname, extra_args=extra_args)
+
+if __name__ == "__main__":
+    main()
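The new rotate() keeps the seven newest uploads at daily resolution, the next four spaced roughly a week apart, and older ones roughly a month apart, deleting whatever violates that spacing. A small self-contained sketch of the daily/weekly boundary check, with made-up dates and no S3 calls:

#!/usr/bin/python3
# Stand-alone illustration of the thinning rule in rotate() above; the dates
# are invented and nothing here touches S3.
from datetime import datetime, timedelta

daily_backups = 7

# Nine nightly backups, newest first, one day apart.
backups = sorted(
    (datetime(2021, 7, 29, 4, 0) - timedelta(days=n) for n in range(9)),
    reverse=True,
)

if (len(backups) > daily_backups + 1
        and backups[daily_backups] - backups[daily_backups + 1] < timedelta(days=7)):
    # backups[7] has just aged out of the daily window but is only one day
    # newer than backups[8], so rotate() would delete its S3 object and drop
    # it from the list, leaving the older copy as the weekly representative.
    doomed = backups[daily_backups]
    print("would delete key suffix", doomed.strftime("-%Y-%m-%d-%H-%M"))
    del backups[daily_backups]

print(len(backups), "backups kept")  # 8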


@@ -1,39 +0,0 @@
-#!/bin/sh
-MONGODB_SHELL='mongo'
-DUMP_UTILITY='mongodump'
-DB_NAME='newsblur'
-
-COLLECTIONS="classifier_tag classifier_author classifier_feed classifier_title userstories starred_stories"
-
-date_now=`date +%Y_%m_%d_%H_%M`
-dir_name='backup_mongo_'${date_now}
-file_name='backup_mongo_'${date_now}'.bz2'
-
-log() {
-    echo $1
-}
-
-do_cleanup(){
-    rm -rf backup_mongo_*
-    log 'cleaning up....'
-}
-
-do_backup(){
-    log 'snapshotting the db and creating archive'
-    # ${MONGODB_SHELL} admin fsync_lock.js
-    for collection in $COLLECTIONS
-    do
-        ${DUMP_UTILITY} --db ${DB_NAME} --collection $collection -o ${dir_name}
-    done
-    tar -jcf $file_name ${dir_name}
-    # ${MONGODB_SHELL} admin fsync_unlock.js
-    log 'data backd up and created snapshot'
-}
-
-save_in_s3(){
-    log 'saving the backup archive in amazon S3' && \
-    python s3.py set ${file_name} && \
-    log 'data backup saved in amazon s3'
-}
-
-do_backup && save_in_s3 && do_cleanup