From 5aaf4d5c38e6a12e8d3459ad2ed229172f6115b0 Mon Sep 17 00:00:00 2001
From: Samuel Clay <samuel@newsblur.com>
Date: Thu, 29 Jul 2021 18:17:02 -0400
Subject: [PATCH] Rewriting mongo backup, adding in rotation.

---
 config/requirements.txt       |   2 +-
 docker/mongo/backup_mongo.sh  |   6 +-
 utils/backups/backup_mongo.py | 139 ++++++++++++++++++++++++++++++----
 utils/backups/mongo_backup.sh |  39 ----------
 4 files changed, 129 insertions(+), 57 deletions(-)
 delete mode 100755 utils/backups/mongo_backup.sh

diff --git a/config/requirements.txt b/config/requirements.txt
index 91f5a171f..2dd5a900f 100644
--- a/config/requirements.txt
+++ b/config/requirements.txt
@@ -8,7 +8,7 @@ beautifulsoup4==4.9.3
 billiard==3.6.4.0
 bleach==3.2.1
 boto==2.49.0
-boto3==1.17.67
+boto3==1.18.9
 botocore==1.20.67
 celery==4.4.7
 certifi==2020.12.5
diff --git a/docker/mongo/backup_mongo.sh b/docker/mongo/backup_mongo.sh
index e57c0126c..a0fa0fd84 100644
--- a/docker/mongo/backup_mongo.sh
+++ b/docker/mongo/backup_mongo.sh
@@ -21,11 +21,11 @@
 for collection in ${collections[@]}; do
     now=$(date '+%Y-%m-%d-%H-%M')
     echo "---> Dumping $collection - ${now}"
-    docker exec -it mongo mongodump -d newsblur -c $collection -o /backup/backup_mongo_${now}
+    docker exec -it mongo mongodump -d newsblur -c $collection -o /backup/backup_mongo
 done;
 
-echo " ---> Compressing backup_mongo_${now}.tgz"
-tar -zcf /opt/mongo/newsblur/backup/backup_mongo_${now}.tgz /opt/mongo/newsblur/backup/backup_mongo_${now}
+echo " ---> Compressing backup_mongo.tgz"
+tar -zcf /opt/mongo/newsblur/backup/backup_mongo.tgz /opt/mongo/newsblur/backup/backup_mongo
 
 echo " ---> Uploading backups to S3"
 docker run --rm -v /srv/newsblur:/srv/newsblur -v /opt/mongo/newsblur/backup/:/opt/mongo/newsblur/backup/ --network=newsblurnet newsblur/newsblur_python3:latest python /srv/newsblur/utils/backups/backup_mongo.py
@@ -33,4 +33,4 @@ docker run --rm -v /srv/newsblur:/srv/newsblur -v /opt/mongo/newsblur/backup/:/opt/mongo/newsblur/backup/ --network=newsblurnet newsblur/newsblur_python3:latest python /srv/newsblur/utils/backups/backup_mongo.py
 # Don't delete backup since the backup_mongo.py script will rm them
 ## rm /opt/mongo/newsblur/backup/backup_mongo_${now}.tgz
 ## rm /opt/mongo/newsblur/backup/backup_mongo_${now}
-echo " ---> Finished uploading backups to S3: backup_mongo_${now}.tgz"
+echo " ---> Finished uploading backups to S3: backup_mongo.tgz"
diff --git a/utils/backups/backup_mongo.py b/utils/backups/backup_mongo.py
index 998a228f2..a7bb28034 100755
--- a/utils/backups/backup_mongo.py
+++ b/utils/backups/backup_mongo.py
@@ -1,19 +1,130 @@
 #!/usr/bin/python3
+from datetime import datetime, timedelta
 import os
-import shutil
-
-from newsblur_web import settings
+import re
+import logging
+import mimetypes
 import boto3
-from botocore.exceptions import ClientError
+import shutil
+from boto3.s3.transfer import S3Transfer
+from newsblur_web import settings
 
-filenames = [f for f in os.listdir('/opt/mongo/newsblur/backup/') if '.tgz' in f]
+logger = logging.getLogger(__name__)
 
-for filename in filenames:
-    print('Uploading %s to S3...' % filename)
-    try:
-        s3 = boto3.client('s3')
-        s3.upload_file(f"mongo/{filename}", settings.S3_BACKUP_BUCKET)
-    except ClientError as e:
-        print(" ****> Exceptions: %s" % e)
-    shutil.rmtree(filename[:-4])
-    os.remove(filename)
+
+def main():
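+    # Timestamp each local dump archive, upload it to S3, then prune old
+    # copies so every run rotates against the same key prefix.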
+    BACKUP_DIR = '/opt/mongo/newsblur/backup/'
+    filenames = [f for f in os.listdir(BACKUP_DIR) if '.tgz' in f]
+    for filename in filenames:
+        file_path = os.path.join(BACKUP_DIR, filename)
+        basename = os.path.basename(file_path)
+        key_base, key_ext = splitext(basename)
+        key_prefix = "".join(['mongo/', key_base])
+        key_datestamp = datetime.utcnow().strftime("-%Y-%m-%d-%H-%M")
+        key = "".join([key_prefix, key_datestamp, key_ext])
+
+        print("Uploading {0} to {1}".format(file_path, key))
+        upload(file_path, settings.S3_BACKUP_BUCKET, key)
+
+        print('Rotating files on S3 with key prefix {0} and extension {1}'.format(key_prefix, key_ext))
+        rotate(key_prefix, key_ext, settings.S3_BACKUP_BUCKET)
+
+        # shutil.rmtree(filename[:-4])
+        # os.remove(filename)
+
+
+def upload_rotate(file_path, s3_bucket, s3_key_prefix, aws_key=None, aws_secret=None):
+    '''
+    Upload file_path to an S3 bucket under the given key prefix, then rotate old copies.
+    Ex. upload_rotate('/tmp/file-2015-01-01.tar.bz2', 'backups', 'foo.net/')
+    uploads the file to bucket backups with key=foo.net/file-2015-01-01.tar.bz2
+    and then rotates all keys starting with foo.net/file and ending in .tar.bz2.
+    A timestamp must sit between the file root and the extension, in the same
+    format as strftime("%Y-%m-%d"). Ex: file-2015-12-28.tar.bz2
+    '''
+    key = ''.join([s3_key_prefix, os.path.basename(file_path)])
+    logger.debug("Uploading {0} to {1}".format(file_path, key))
+    upload(file_path, s3_bucket, key, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret)
+
+    file_root, file_ext = splitext(os.path.basename(file_path))
+    # Strip the timestamp from the file root to recover the shared key prefix
+    regex = r'(?P<filename>.*)-(?P<year>[\d]+?)-(?P<month>[\d]+?)-(?P<day>[\d]+?)'
+    match = re.match(regex, file_root)
+    if not match:
+        raise Exception('File does not contain a timestamp')
+    key_prefix = ''.join([s3_key_prefix, match.group('filename')])
+    logger.debug('Rotating files on S3 with key prefix {0} and extension {1}'.format(key_prefix, file_ext))
+    rotate(key_prefix, file_ext, s3_bucket, aws_key=aws_key, aws_secret=aws_secret)
+
+
+def rotate(key_prefix, key_ext, bucket_name, daily_backups=7, weekly_backups=4, aws_key=None, aws_secret=None):
+    """ Delete old backups on S3 according to a grandfather-father-son strategy """
+
+    session = boto3.Session(
+        aws_access_key_id=aws_key,
+        aws_secret_access_key=aws_secret
+    )
+    s3 = session.resource('s3')
+    bucket = s3.Bucket(bucket_name)
+    keys = bucket.objects.filter(Prefix=key_prefix)
+
+    regex = r'{0}-(?P<year>[\d]+?)-(?P<month>[\d]+?)-(?P<day>[\d]+?)-(?P<hour>[\d]+?)-(?P<minute>[\d]+?){1}'.format(re.escape(key_prefix), re.escape(key_ext))
+    backups = []
+
+    for key in keys:
+        match = re.match(regex, str(key.key))
+        if not match:
+            continue
+        year = int(match.group('year'))
+        month = int(match.group('month'))
+        day = int(match.group('day'))
+        hour = int(match.group('hour'))
+        minute = int(match.group('minute'))
+        key_date = datetime(year, month, day, hour, minute)
+        backups.append(key_date)
+    backups = sorted(backups, reverse=True)
+
+    # Keep daily_backups dailies; when the two backups just past that window
+    # are less than a week apart, delete the newer one so roughly one backup
+    # per week survives.
+    if len(backups) > daily_backups+1 and backups[daily_backups] - backups[daily_backups+1] < timedelta(days=7):
+        key = bucket.Object("{0}{1}{2}".format(key_prefix, backups[daily_backups].strftime("-%Y-%m-%d-%H-%M"), key_ext))
+        logger.debug("deleting {0}".format(key))
+        key.delete()
+        del backups[daily_backups]
+
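+    # Same rule one tier up: past the weekly slots, collapse backups spaced
+    # less than a month apart, leaving roughly one backup per month.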
+    month_offset = daily_backups + weekly_backups
+    if len(backups) > month_offset+1 and backups[month_offset] - backups[month_offset+1] < timedelta(days=30):
+        key = bucket.Object("{0}{1}{2}".format(key_prefix, backups[month_offset].strftime("-%Y-%m-%d-%H-%M"), key_ext))
+        logger.debug("deleting {0}".format(key))
+        key.delete()
+        del backups[month_offset]
+
+
+def splitext(filename):
+    """ Return the filename and extension split at the first dot, so the date
+    stamp lands ahead of multi-part extensions like .tar.bz2 or .ext.gz.
+    """
+    index = filename.find('.')
+    if index == 0:
+        index = 1 + filename[1:].find('.')
+    if index == -1:
+        return filename, ''
+    return filename[:index], filename[index:]
+
+
+def upload(source_path, bucketname, keyname, acl='private', guess_mimetype=True, aws_access_key_id=None, aws_secret_access_key=None):
+    client = boto3.client('s3', 'us-west-2', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+    # S3Transfer handles multipart uploads for large dump archives
+    transfer = S3Transfer(client)
+    extra_args = {
+        'ACL': acl,
+    }
+    if guess_mimetype:
+        mtype = mimetypes.guess_type(keyname)[0] or 'application/octet-stream'
+        extra_args['ContentType'] = mtype
+
+    transfer.upload_file(source_path, bucketname, keyname, extra_args=extra_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/backups/mongo_backup.sh b/utils/backups/mongo_backup.sh
deleted file mode 100755
index 5938db7b2..000000000
--- a/utils/backups/mongo_backup.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/sh
-
-MONGODB_SHELL='mongo'
-DUMP_UTILITY='mongodump'
-DB_NAME='newsblur'
-COLLECTIONS="classifier_tag classifier_author classifier_feed classifier_title userstories starred_stories"
-
-date_now=`date +%Y_%m_%d_%H_%M`
-dir_name='backup_mongo_'${date_now}
-file_name='backup_mongo_'${date_now}'.bz2'
-
-log() {
-    echo $1
-}
-
-do_cleanup(){
-    rm -rf backup_mongo_*
-    log 'cleaning up....'
-}
-
-do_backup(){
-    log 'snapshotting the db and creating archive'
-    # ${MONGODB_SHELL} admin fsync_lock.js
-    for collection in $COLLECTIONS
-    do
-        ${DUMP_UTILITY} --db ${DB_NAME} --collection $collection -o ${dir_name}
-    done
-    tar -jcf $file_name ${dir_name}
-    # ${MONGODB_SHELL} admin fsync_unlock.js
-    log 'data backd up and created snapshot'
-}
-
-save_in_s3(){
-    log 'saving the backup archive in amazon S3' && \
-    python s3.py set ${file_name} && \
-    log 'data backup saved in amazon s3'
-}
-
-do_backup && save_in_s3 && do_cleanup
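
Note on the retention cadence: run once per nightly backup, the two pruning rules in rotate() settle into roughly a week of dailies, a month of weeklies, and monthlies beyond that. The standalone sketch below (not part of the patch; it makes no S3 calls, and the 90-night horizon is an arbitrary choice) replays the same rules to show which timestamps survive:

    from datetime import datetime, timedelta

    DAILY_BACKUPS, WEEKLY_BACKUPS = 7, 4  # rotate()'s defaults

    def rotate_once(backups):
        # One pass of rotate()'s two pruning rules, newest backup first.
        backups = sorted(backups, reverse=True)
        if (len(backups) > DAILY_BACKUPS + 1 and
                backups[DAILY_BACKUPS] - backups[DAILY_BACKUPS + 1] < timedelta(days=7)):
            del backups[DAILY_BACKUPS]
        month_offset = DAILY_BACKUPS + WEEKLY_BACKUPS
        if (len(backups) > month_offset + 1 and
                backups[month_offset] - backups[month_offset + 1] < timedelta(days=30)):
            del backups[month_offset]
        return backups

    # Simulate 90 nightly runs: add the night's backup, then rotate.
    backups = []
    for night in range(90):
        backups.append(datetime(2021, 7, 29) + timedelta(days=night))
        backups = rotate_once(backups)

    print([b.strftime("%Y-%m-%d") for b in backups])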