As an infrastructure engineer working with petabytes of S3 data, I've faced multiple scenarios where entire production buckets nearly got deleted - either through human error (like a misconfigured Terraform script) or compromised credentials. The challenge intensifies when:
- Versioning isn't enabled on source buckets (often due to compliance constraints)
- Bucket sizes exceed practical local storage limits
- You need immediate recovery capabilities
Here's my battle-tested approach using S3 Cross-Region Replication (CRR) with separate AWS accounts:
# Terraform configuration for secure backup bucket
resource "aws_s3_bucket" "backup_bucket" {
  bucket = "prod-backups-${random_id.backup.hex}"
  acl    = "private"

  versioning {
    enabled = true # Always enable versioning for backups
  }

  lifecycle {
    prevent_destroy = true # Critical safeguard against accidental "terraform destroy"
  }

  server_side_encryption_configuration {
    rule {
      apply_server_side_encryption_by_default {
        sse_algorithm = "AES256"
      }
    }
  }
}
# Bucket policy for the backup bucket: only the source account's replication
# role may write replicated objects into it
data "aws_iam_policy_document" "replication" {
  statement {
    actions = [
      "s3:ReplicateObject",
      "s3:ReplicateDelete",
      "s3:ReplicateTags"
    ]
    resources = ["${aws_s3_bucket.backup_bucket.arn}/*"]

    principals {
      type        = "AWS"
      identifiers = [aws_iam_role.source_account_role.arn]
    }
  }
}
Backups are worthless if you don't verify them. We implement:
# Python script using boto3 to validate backup integrity
import boto3
from datetime import timedelta

def _list_objects(s3, bucket):
    """Return all objects in a bucket (paginated; list_objects_v2 caps at 1000 per call)."""
    paginator = s3.get_paginator('list_objects_v2')
    objects = []
    for page in paginator.paginate(Bucket=bucket):
        objects.extend(page.get('Contents', []))
    return objects

def verify_backup(source_bucket, backup_bucket):
    s3 = boto3.client('s3')
    source_objects = _list_objects(s3, source_bucket)
    backup_objects = _list_objects(s3, backup_bucket)

    # Check object count delta (allow 1% drift for in-flight replication)
    source_count = len(source_objects)
    backup_count = len(backup_objects)
    if abs(source_count - backup_count) > (source_count * 0.01):
        raise Exception(f"Backup count mismatch: {source_count} vs {backup_count}")

    # Verify recent modifications have actually been replicated
    last_source_mod = max(obj['LastModified'] for obj in source_objects)
    last_backup_mod = max(obj['LastModified'] for obj in backup_objects)
    if (last_source_mod - last_backup_mod) > timedelta(hours=6):
        raise Exception("Backup is stale by more than 6 hours")
Having recovered multiple production buckets, here's our playbook:
- Immediate Account Isolation: Freeze IAM credentials that may have caused deletion
- Backup Validation: Run checksum verification on backup objects (see the spot-check sketch after this list)
- Parallel Restoration: Use S3 Batch Operations to restore at scale
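For step 2, a small spot-check sketch that compares ETags between source and backup copies of the same keys (ETags are only MD5 digests for single-part, non-KMS uploads, so treat a mismatch as a prompt to investigate rather than proof of corruption; bucket names and the sampled key list are placeholders):
import boto3

def spot_check_etags(source_bucket, backup_bucket, keys):
    """Compare ETags for a sample of keys across source and backup buckets."""
    s3 = boto3.client('s3')
    mismatches = []
    for key in keys:
        src = s3.head_object(Bucket=source_bucket, Key=key)
        bak = s3.head_object(Bucket=backup_bucket, Key=key)
        if src['ETag'] != bak['ETag']:
            mismatches.append(key)
    return mismatches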
# AWS CLI commands for mass restoration
# First identify objects needing restoration
BACKUP_BUCKET="prod-backups-xyz"
RESTORE_BUCKET="recovered-prod-bucket"

# Generate a CSV manifest (bucket,key) of all backup objects -
# S3 Batch Operations expects a CSV manifest, not JSON
aws s3api list-objects-v2 --bucket "$BACKUP_BUCKET" \
  --query 'Contents[].Key' --output text | tr '\t' '\n' \
  | awk -v b="$BACKUP_BUCKET" '{print b "," $0}' > restore-manifest.csv

# The manifest must live in S3; note the uploaded object's ETag for the job definition
aws s3 cp restore-manifest.csv "s3://$BACKUP_BUCKET/restore-manifest.csv"

# Create batch operation job (--role-arn is required; the role needs read
# access on the backup bucket and write access on the restore bucket)
aws s3control create-job \
  --account-id YOUR_ACCOUNT_ID \
  --operation '{"S3PutObjectCopy": { "TargetResource": "arn:aws:s3:::'$RESTORE_BUCKET'" }}' \
  --manifest '{"Spec":{"Format":"S3BatchOperations_CSV_20180820","Fields":["Bucket","Key"]},"Location":{"ObjectArn":"arn:aws:s3:::'$BACKUP_BUCKET'/restore-manifest.csv","ETag":"ETAG_VALUE"}}' \
  --report '{"Bucket":"arn:aws:s3:::operation-reports","Format":"Report_CSV_20180820","Enabled":true,"Prefix":"restore-reports"}' \
  --priority 10 \
  --role-arn arn:aws:iam::YOUR_ACCOUNT_ID:role/batch-ops-role
For massive buckets (100TB+), we implement tiered backups:
| Tier | Storage Class        | Retention | Use Case           |
|------|----------------------|-----------|--------------------|
| Hot  | Standard             | 7 days    | Immediate recovery |
| Warm | Standard-IA          | 30 days   | Recent versions    |
| Cold | Glacier Deep Archive | 1 year+   | Compliance/audit   |
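In practice we drive the Warm and Cold tiers with a lifecycle configuration on the backup bucket. A boto3 sketch under illustrative assumptions (bucket name and day counts are placeholders; note S3 will not transition objects to Standard-IA sooner than 30 days after creation, so the nominal 7-day hot tier maps to a 30-day transition here):
import boto3

s3 = boto3.client('s3')
s3.put_bucket_lifecycle_configuration(
    Bucket='prod-backups-xyz',  # illustrative bucket name
    LifecycleConfiguration={
        'Rules': [{
            'ID': 'tiered-backups',
            'Status': 'Enabled',
            'Filter': {'Prefix': ''},
            'Transitions': [
                # S3 requires >= 30 days in Standard before an IA transition
                {'Days': 30, 'StorageClass': 'STANDARD_IA'},
                {'Days': 90, 'StorageClass': 'DEEP_ARCHIVE'},
            ],
            # Expire after the compliance window; tune to your retention policy
            'Expiration': {'Days': 400},
        }]
    },
)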
To recap the core problem: for mission-critical data in Amazon S3, an accidental bucket deletion is catastrophic. Versioning protects individual objects, but it does not stop the bucket itself from being deleted, and for buckets holding 100GB+ of data, keeping local backups is impractical.
One robust safeguard is cross-region replication (CRR) paired with a strict bucket policy that denies bucket deletion outright:
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Deny",
      "Principal": "*",
      "Action": "s3:DeleteBucket",
      "Resource": "arn:aws:s3:::your-production-bucket"
    }
  ]
}
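A minimal sketch of applying that policy with boto3 (the bucket name is a placeholder; the same JSON can just as well be attached via the console, CLI, or Terraform):
import json
import boto3

deny_delete_policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Deny",
        "Principal": "*",
        "Action": "s3:DeleteBucket",
        "Resource": "arn:aws:s3:::your-production-bucket",  # placeholder bucket
    }],
}

s3 = boto3.client('s3')
# Attach the deny-delete policy to the production bucket
s3.put_bucket_policy(
    Bucket='your-production-bucket',
    Policy=json.dumps(deny_delete_policy),
)
With this in place, deleting the bucket first requires removing or editing the policy itself, which adds exactly the kind of friction that stops a runaway script.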
Combine this with a CRR configuration. Note that CRR requires versioning on both the source and destination buckets, so if compliance constraints keep the source unversioned, lean on the Lambda- and Batch-Operations-based copies below instead:
aws s3api put-bucket-replication \
  --bucket source-bucket \
  --replication-configuration '{
    "Role": "arn:aws:iam::account-id:role/replication-role",
    "Rules": [
      {
        "Status": "Enabled",
        "Priority": 1,
        "DeleteMarkerReplication": { "Status": "Disabled" },
        "Filter": { "Prefix": "" },
        "Destination": {
          "Bucket": "arn:aws:s3:::backup-bucket",
          "StorageClass": "STANDARD"
        }
      }
    ]
  }'
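The replication-role referenced above needs a trust policy for s3.amazonaws.com plus read permissions on the source and replicate permissions on the destination. A compressed boto3 sketch, with the role and bucket names as placeholders:
import json
import boto3

iam = boto3.client('iam')

# S3 must be able to assume the role to replicate on your behalf
iam.create_role(
    RoleName='replication-role',  # placeholder name
    AssumeRolePolicyDocument=json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Effect": "Allow",
            "Principal": {"Service": "s3.amazonaws.com"},
            "Action": "sts:AssumeRole",
        }],
    }),
)

# Read from the source, write replicas to the destination
iam.put_role_policy(
    RoleName='replication-role',
    PolicyName='crr-permissions',
    PolicyDocument=json.dumps({
        "Version": "2012-10-17",
        "Statement": [
            {"Effect": "Allow",
             "Action": ["s3:GetReplicationConfiguration", "s3:ListBucket"],
             "Resource": "arn:aws:s3:::source-bucket"},
            {"Effect": "Allow",
             "Action": ["s3:GetObjectVersionForReplication",
                        "s3:GetObjectVersionAcl",
                        "s3:GetObjectVersionTagging"],
             "Resource": "arn:aws:s3:::source-bucket/*"},
            {"Effect": "Allow",
             "Action": ["s3:ReplicateObject", "s3:ReplicateDelete", "s3:ReplicateTags"],
             "Resource": "arn:aws:s3:::backup-bucket/*"},
        ],
    }),
)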
For more granular control, implement a Lambda-based backup system triggered by S3 object-created event notifications:
const AWS = require('aws-sdk');
const s3 = new AWS.S3();

exports.handler = async (event) => {
  const sourceBucket = event.Records[0].s3.bucket.name;
  // S3 event keys are URL-encoded with '+' for spaces
  const objectKey = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, ' '));
  const backupBucket = `${sourceBucket}-backup-${new Date().toISOString().split('T')[0]}`;

  try {
    await s3.copyObject({
      Bucket: backupBucket,
      CopySource: `${sourceBucket}/${objectKey}`,
      Key: objectKey
    }).promise();
  } catch (error) {
    console.error('Backup failed:', error);
    throw error; // let Lambda retry / dead-letter the event
  }
};
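To wire up the trigger, the source bucket needs an event notification pointing at the function (and the function needs a resource policy allowing s3.amazonaws.com to invoke it). A boto3 sketch with placeholder bucket name and Lambda ARN:
import boto3

s3 = boto3.client('s3')
# Invoke the backup Lambda for every new object in the source bucket
s3.put_bucket_notification_configuration(
    Bucket='your-production-bucket',  # placeholder
    NotificationConfiguration={
        'LambdaFunctionConfigurations': [{
            'LambdaFunctionArn': 'arn:aws:lambda:us-east-1:123456789012:function:s3-backup',
            'Events': ['s3:ObjectCreated:*'],
        }]
    },
)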
If you also want a daily scheduled sweep (for example, to re-copy anything the event-driven path missed), add an EventBridge rule. Note that a scheduled invocation carries no S3 event Records, so a scheduled handler must list and copy objects itself rather than read event.Records:
aws events put-rule \
--name "DailyS3Backup" \
--schedule-expression "cron(0 2 * * ? *)"
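The rule does nothing until it has a target and the function permits EventBridge to invoke it. A boto3 sketch, with the sweep function's name and ARNs as placeholders:
import boto3

events = boto3.client('events')
awslambda = boto3.client('lambda')

# Point the schedule at the backup sweep function
events.put_targets(
    Rule='DailyS3Backup',
    Targets=[{
        'Id': 'daily-s3-backup-lambda',
        'Arn': 'arn:aws:lambda:us-east-1:123456789012:function:s3-backup-sweep',
    }],
)

# Allow EventBridge to invoke the function
awslambda.add_permission(
    FunctionName='s3-backup-sweep',
    StatementId='allow-eventbridge-daily-backup',
    Action='lambda:InvokeFunction',
    Principal='events.amazonaws.com',
    SourceArn='arn:aws:events:us-east-1:123456789012:rule/DailyS3Backup',
)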
For buckets exceeding 100GB, S3 Batch Operations provides scalable backup:
aws s3control create-job \
--account-id 123456789012 \
--operation '{"S3PutObjectCopy": {"TargetResource": "arn:aws:s3:::backup-bucket"}}' \
--manifest '{"Spec": {"Format": "S3BatchOperations_CSV_20180820", "Fields": ["Bucket", "Key"]}, "Location": {"ObjectArn": "arn:aws:s3:::source-bucket/manifest.csv", "ETag": "exampleETag"}}' \
--report '{"Bucket": "arn:aws:s3:::backup-reports", "Format": "Report_CSV_20180820", "Enabled": true, "Prefix": "reports", "ReportScope": "AllTasks"}' \
--priority 10 \
--role-arn arn:aws:iam::123456789012:role/batch-ops-role
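create-job returns a JobId; we poll it until the job reaches a terminal state before trusting the copy. A small boto3 helper (account ID and job ID are supplied by the caller):
import time
import boto3

s3control = boto3.client('s3control')

def wait_for_batch_job(account_id, job_id, poll_seconds=30):
    """Poll an S3 Batch Operations job until it reaches a terminal state."""
    while True:
        job = s3control.describe_job(AccountId=account_id, JobId=job_id)['Job']
        status = job['Status']
        if status in ('Complete', 'Failed', 'Cancelled'):
            return status
        time.sleep(poll_seconds)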
Finally, alarm on suspicious activity. There is no native AWS/S3 metric for bucket deletions, so we send CloudTrail to a CloudWatch Logs group, add a metric filter for DeleteBucket calls, and alarm on the resulting custom metric (the log group name below is an example; use your CloudTrail log group):
# Metric filter: count DeleteBucket API calls recorded by CloudTrail
aws logs put-metric-filter \
  --log-group-name "CloudTrail/DefaultLogGroup" \
  --filter-name "S3BucketDeletion" \
  --filter-pattern '{ ($.eventSource = "s3.amazonaws.com") && ($.eventName = "DeleteBucket") }' \
  --metric-transformations metricName=BucketDeleteCount,metricNamespace=Custom/S3,metricValue=1
# Alarm on the custom metric
aws cloudwatch put-metric-alarm \
  --alarm-name "S3BucketDeletionAttempt" \
  --metric-name "BucketDeleteCount" \
  --namespace "Custom/S3" \
  --statistic "Sum" \
  --period 300 \
  --threshold 1 \
  --comparison-operator "GreaterThanOrEqualToThreshold" \
  --evaluation-periods 1 \
  --treat-missing-data notBreaching \
  --alarm-actions "arn:aws:sns:us-east-1:123456789012:AdminAlerts"