This guide covers security considerations for deploying and using staRburst in production environments.
Never hard-code AWS credentials in your code:
# NEVER DO THIS - credentials exposed in code
Sys.setenv(
AWS_ACCESS_KEY_ID = "AKIA...",
AWS_SECRET_ACCESS_KEY = "wJalr..."
)
Why it’s dangerous:
- Credentials in code can be committed to version control
- Logs may capture environment variables
- Code sharing exposes credentials
When running on AWS infrastructure (EC2, ECS, Lambda):
# No credentials needed - automatically uses instance/task IAM role
library(starburst)
plan(starburst, workers = 10)
Benefits:
- No credential management required
- Automatic credential rotation
- Fine-grained permissions via IAM policies
- Audit trail via CloudTrail
Setup: 1. Create IAM role with required permissions 2. Attach role to EC2 instance or ECS task 3. staRburst automatically discovers and uses role credentials
For local development, use AWS CLI profiles:
# Credentials stored in ~/.aws/credentials
Sys.setenv(AWS_PROFILE = "my-starburst-profile")
library(starburst)
plan(starburst, workers = 10)
Setup:
For cross-account access or enhanced security:
library(paws.security.identity)
sts <- paws.security.identity::sts()
# Assume role with MFA
credentials <- sts$assume_role(
RoleArn = "arn:aws:iam::123456789012:role/StarburstRole",
RoleSessionName = "starburst-session",
SerialNumber = "arn:aws:iam::123456789012:mfa/user",
TokenCode = "123456" # MFA token
)
# Use temporary credentials
Sys.setenv(
AWS_ACCESS_KEY_ID = credentials$Credentials$AccessKeyId,
AWS_SECRET_ACCESS_KEY = credentials$Credentials$SecretAccessKey,
AWS_SESSION_TOKEN = credentials$Credentials$SessionToken
)
Benefits:
- Time-limited credentials (expire after 1-12 hours)
- Can require MFA for sensitive operations
- Separate credentials for different roles
Create an IAM policy for staRburst workers:
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "StarburstWorkerAccess",
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject",
"s3:DeleteObject",
"s3:ListBucket"
],
"Resource": [
"arn:aws:s3:::my-starburst-bucket",
"arn:aws:s3:::my-starburst-bucket/*"
],
"Condition": {
"StringLike": {
"s3:prefix": ["sessions/*", "tasks/*", "results/*"]
}
}
}
]
}
Encrypt data at rest in S3:
# Option 1: Enable default encryption via AWS Console or CLI
library(paws.storage)
s3 <- paws.storage::s3()
s3$put_bucket_encryption(
Bucket = "my-starburst-bucket",
ServerSideEncryptionConfiguration = list(
Rules = list(
list(
ApplyServerSideEncryptionByDefault = list(
SSEAlgorithm = "AES256" # Or "aws:kms" for KMS
),
BucketKeyEnabled = TRUE
)
)
)
)
Encryption Options:
- AES256 — S3-managed keys (SSE-S3, free)
- aws:kms — AWS KMS managed keys (SSE-KMS, more control, costs apply)
- aws:kms:dsse — Dual-layer encryption for compliance (SSE-KMS-DSSE)
Protect against accidental deletion:
s3$put_bucket_versioning(
Bucket = "my-starburst-bucket",
VersioningConfiguration = list(
Status = "Enabled"
)
)
Benefits:
- Recover from accidental deletions
- Rollback to previous versions
- Required for certain compliance frameworks
Ensure bucket is never publicly accessible:
Restrict access to specific IAM roles:
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "DenyUnencryptedObjectUploads",
"Effect": "Deny",
"Principal": "*",
"Action": "s3:PutObject",
"Resource": "arn:aws:s3:::my-starburst-bucket/*",
"Condition": {
"StringNotEquals": {
"s3:x-amz-server-side-encryption": "AES256"
}
}
},
{
"Sid": "DenyInsecureTransport",
"Effect": "Deny",
"Principal": "*",
"Action": "s3:*",
"Resource": "arn:aws:s3:::my-starburst-bucket/*",
"Condition": {
"Bool": {
"aws:SecureTransport": "false"
}
}
}
]
}
Deploy workers in private subnets without internet access:
library(starburst)
# Configure to use private subnets
starburst_config(
subnets = c("subnet-private-1a", "subnet-private-1b"),
security_groups = c("sg-starburst-workers")
)
plan(starburst, workers = 10)
Requirements for Private Subnets:
- VPC endpoints for S3, ECR, CloudWatch Logs
- NAT Gateway if internet access needed for some operations
Minimal security group (no inbound, only outbound):
Avoid internet traffic for AWS service calls:
S3 Gateway Endpoint (free):
aws ec2 create-vpc-endpoint \
--vpc-id vpc-... \
--service-name com.amazonaws.us-east-1.s3 \
--route-table-ids rtb-...
ECR Interface Endpoints (charges apply):
# ECR API endpoint
aws ec2 create-vpc-endpoint \
--vpc-id vpc-... \
--service-name com.amazonaws.us-east-1.ecr.api \
--vpc-endpoint-type Interface \
--subnet-ids subnet-... \
--security-group-ids sg-...
# ECR Docker endpoint
aws ec2 create-vpc-endpoint \
--vpc-id vpc-... \
--service-name com.amazonaws.us-east-1.ecr.dkr \
--vpc-endpoint-type Interface \
--subnet-ids subnet-... \
--security-group-ids sg-...
CloudWatch Logs Endpoint:
aws ec2 create-vpc-endpoint \
--vpc-id vpc-... \
--service-name com.amazonaws.us-east-1.logs \
--vpc-endpoint-type Interface \
--subnet-ids subnet-... \
--security-group-ids sg-...
Benefits:
- All AWS API traffic stays within AWS network
- Reduced data transfer costs
- Better security posture (no internet exposure)
Create AWS Budget to monitor staRburst costs:
staRburst enforces maximum 500 workers per plan:
Regularly check for orphaned sessions:
# List all sessions
sessions <- starburst_list_sessions()
print(sessions)
# Cleanup old sessions
for (session_id in sessions$session_id) {
# Check if session is old
created_date <- as.Date(sessions$created_at[sessions$session_id == session_id])
if (Sys.Date() - created_date > 7) {
cat(sprintf("Cleaning up old session: %s\n", session_id))
session <- starburst_session_attach(session_id)
session$cleanup(stop_workers = TRUE, force = TRUE)
}
}
Prevent sessions from running indefinitely:
Before launching large jobs:
# Estimate the Fargate bill for a parallel job before launching it.
# Fargate on-demand pricing (us-east-1, 2026): billed per vCPU-hour and per GB-hour.
vcpu_hourly <- 0.04048   # USD per vCPU per hour
mem_hourly <- 0.004445   # USD per GB of memory per hour

# Job shape: how many workers, how big, and for how long.
n_workers <- 100
vcpus_each <- 4
mem_gb_each <- 8
runtime_hours <- 2

# Hourly rate for one worker, then scale out by fleet size and duration.
hourly_per_worker <- (vcpus_each * vcpu_hourly) + (mem_gb_each * mem_hourly)
total_cost <- n_workers * hourly_per_worker * runtime_hours
cat(sprintf("Estimated cost: $%.2f for %d hours\n", total_cost, runtime_hours))
# Estimated cost: $39.50 for 2 hours
Capture all API calls for security auditing:
What CloudTrail Captures: - Who launched staRburst workers (IAM user/role) - When tasks were created/stopped - S3 object access (if data events enabled) - API failures and authorization errors
staRburst automatically logs to CloudWatch:
/aws/ecs/starburst-worker
Increase retention for compliance:
Track all S3 bucket access:
s3$put_bucket_logging(
Bucket = "my-starburst-bucket",
BucketLoggingStatus = list(
LoggingEnabled = list(
TargetBucket = "my-logging-bucket",
TargetPrefix = "starburst-access-logs/"
)
)
)
Access logs include:
- Who accessed objects
- When objects were accessed
- What operations were performed
- Source IP addresses
Regularly review logs for suspicious activity:
# CloudWatch Insights query for failed authentications
library(paws.management)
logs <- paws.management::cloudwatchlogs()
# Query for errors in last 24 hours
query <- "fields @timestamp, @message
| filter @message like /ERROR|AccessDenied|Forbidden/
| sort @timestamp desc"
start_time <- as.integer(Sys.time() - 86400) # 24 hours ago
end_time <- as.integer(Sys.time())
result <- logs$start_query(
logGroupName = "/aws/ecs/starburst-worker",
startTime = start_time,
endTime = end_time,
queryString = query
)
Only send necessary data to workers:
# ✅ Good - only send needed data
large_data <- read.csv("huge_dataset.csv")
sample_data <- large_data[1:1000, ] # Use sample for testing
plan(starburst, workers = 10)
results <- future_map(1:10, function(i) {
# sample_data automatically uploaded as global
analyze(sample_data[i, ])
})
# ❌ Bad - sends entire large dataset
plan(starburst, workers = 10)
results <- future_map(1:10, function(i) {
analyze(large_data[i, ]) # Uploads all of large_data
})# Collect and immediately delete from S3
session <- starburst_session(workers = 10)
task_id <- session$submit(quote(sensitive_computation()))
# Wait for completion
results <- session$collect(wait = TRUE)
# Immediately cleanup (force = TRUE deletes S3 files)
session$cleanup(force = TRUE)
# Save results locally if needed
saveRDS(results, "local_results.rds")
For highly sensitive data:
# Encrypt before upload
library(sodium)
# Generate key (store securely, e.g., AWS Secrets Manager)
key <- random(32)
# Encrypt data before passing to future
sensitive_data <- serialize(my_data, NULL)
encrypted_data <- data_encrypt(sensitive_data, key)
# Run computation
plan(starburst, workers = 1)
result <- future({
# Decrypt in worker
decrypted <- data_decrypt(encrypted_data, key)
my_data <- unserialize(decrypted)
# Process...
})
staRburst uses multi-stage builds to minimize image size:
Enable ECR image scanning:
Automatically delete old images:
# Delete images older than 30 days
ecr$put_lifecycle_policy(
repositoryName = "starburst-worker",
lifecyclePolicyText = jsonlite::toJSON(list(
rules = list(
list(
rulePriority = 1,
description = "Delete old images",
selection = list(
tagStatus = "any",
countType = "sinceImagePushed",
countUnit = "days",
countNumber = 30
),
action = list(type = "expire")
)
)
), auto_unbox = TRUE)
)
For HIPAA-regulated workloads:
For EU data processing:
For SOC 2 compliance:
Immediate Actions: 1. Disable/rotate credentials immediately 2. Review CloudTrail logs for unauthorized activity 3. Check for new resources created by attacker 4. Terminate suspicious ECS tasks 5. Review S3 bucket access logs
Before deploying staRburst to production: